# Pruebas iniciales con RLLIB y RAY
### Javier Guzmán Muñoz

In [1]:
#imports necesarios
import ray
import ray.rllib.agents.ppo as ppo
import json, os, shutil, sys
import gym
import pprint
import time
import shelve
from tensorflow import keras
from ray import tune

Instructions for updating:
non-resource variables are not supported in the long term


Al inicializar ray podemos configurar varios parámetros.
Entre ellos:
- `local_mode=True` para no distribuir el trabajo entre workers paralelos.
- `_metrics_export_port` el puerto donde se van a exportar las métricas que podemos ver en el Dashboard. Las podemos visualizar y graficar con prometheus https://docs.ray.io/en/master/ray-metrics.html
- `num_cpus`, `num_gpus` para establecer los recursos sobre los que queremos trabajar. En este caso, tenemos 4 CPUs (los 4 cores de mi laptop) y ninguna GPU.
-`ignore_reinit_error=True` para que no nos de error al hacer `ray.init` si ray ya estaba inicializado. En ese caso no se vuelve a inicializar ray.

Enlace a la documentación de etsa función: https://docs.ray.io/en/master/package-ref.html

In [23]:
#inicializamos ray
ray.shutdown()

# Si no ponemos un puerto para exportar las métricas se pone uno aleatorio.
ray.init(ignore_reinit_error=True, _metrics_export_port=63419)

2020-11-19 18:24:26,266	INFO services.py:1090 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'node_ip_address': '10.10.1.128',
 'raylet_ip_address': '10.10.1.128',
 'redis_address': '10.10.1.128:6379',
 'object_store_address': '/tmp/ray/session_2020-11-19_18-24-24_611567_209/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2020-11-19_18-24-24_611567_209/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2020-11-19_18-24-24_611567_209',
 'metrics_export_port': 63419,
 'node_id': '31cd14c23ee3ba5c053c3dc874859cdf1a79b7c5'}

### Caso Básico: Problema basado en texto: Taxi
Ejemplo de https://medium.com/distributed-computing-with-ray/intro-to-rllib-example-environments-3a113f532c70

In [25]:
#Directorio donde guardaremos checkpoints
CHECKPOINT_ROOT = "/tmp/ppo/taxi"
shutil.rmtree(CHECKPOINT_ROOT, ignore_errors=True, onerror=None)

#Directorio donde se guardan los resultados de cada sesión de ray (los podemos ver con tensorboard)
ray_results =os.getenv("HOME")+ "/ray_results"
shutil.rmtree(ray_results, ignore_errors=True, onerror=None)

#Entorno: problema del taxi (https://gym.openai.com/envs/Taxi-v3/)
SELECT_ENV = "Taxi-v3"

config = ppo.DEFAULT_CONFIG.copy()
config["log_level"] = "WARN"
print("Configuración del agente:\n\n" + str(config))
print("\nConfiguración del modelo del agente:\n\n" + str(config["model"]))

Configuración del agente:

{'num_workers': 2, 'num_envs_per_worker': 1, 'create_env_on_driver': False, 'rollout_fragment_length': 200, 'batch_mode': 'truncate_episodes', 'num_gpus': 0, 'train_batch_size': 4000, 'model': {'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'free_log_std': False, 'no_final_linear': False, 'vf_share_layers': True, 'use_lstm': False, 'max_seq_len': 20, 'lstm_cell_size': 256, 'lstm_use_prev_action_reward': False, '_time_major': False, 'framestack': True, 'dim': 84, 'grayscale': False, 'zero_mean': True, 'custom_model': None, 'custom_model_config': {}, 'custom_action_dist': None, 'custom_preprocessor': None}, 'optimizer': {}, 'gamma': 0.99, 'horizon': None, 'soft_horizon': False, 'no_done_at_end': False, 'env_config': {}, 'env': None, 'normalize_actions': False, 'clip_rewards': None, 'clip_actions': True, 'preprocessor_pref': 'deepmind', 'lr': 5e-05, 'monitor': False, 'log_level': 'WARN', 'callbacks': <cl

In [26]:
#Configuración del agente
agent = ppo.PPOTrainer(config, env=SELECT_ENV)

[2m[36m(pid=1466)[0m Instructions for updating:
[2m[36m(pid=1466)[0m non-resource variables are not supported in the long term
[2m[36m(pid=1468)[0m Instructions for updating:
[2m[36m(pid=1468)[0m non-resource variables are not supported in the long term


Una ve creado el agente, cada etapa de su entrenamiento genera unos resultados entre los que podemos estraerinformación muy útil, como estaísiticos sobre las recompensas, la longitud de los episodios o distintas mediciones de tiempos. En la siguiente celda se puede ver un ejemplo de ello para el agente del taxi, donde vemos los valores de los timers, que nos permiten ver, entre otros, el tiempo de aprendizaje de cada iteración de train(), por ejemplo.

In [None]:
result = agent.train()
pprint.pprint(result)

#Observar los valores de timers

In [27]:
#Entrenamos el modelo con 30 iteraciones, llamando al método train sobre agent
N_ITER = 30
s = "{:3d} reward {:6.2f}/{:6.2f}/{:6.2f} len {:6.2f} learn_time(ms) {:6.2f} saved {}"

results = []
episode_data = []
episode_json = []

total_learn_time = 0
for n in range(N_ITER):
    result = agent.train()
    results.append(result)
    episode = {'n': n,
               'episode_reward_min': result['episode_reward_min'],
               'episode_reward_mean': result['episode_reward_mean'],
               'episode_reward_max': result['episode_reward_max'],
               'episode_len_mean': result['episode_len_mean'],
               'learn_time_ms': result['timers']['learn_time_ms']}
    episode_data.append(episode)
    episode_json.append(json.dumps(episode))
    file_name = agent.save(CHECKPOINT_ROOT)
    print(s.format(
    n + 1,
    result["episode_reward_min"],
    result["episode_reward_mean"],
    result["episode_reward_max"],
    result["episode_len_mean"],
    result["timers"]["learn_time_ms"],
    file_name
   ))
    total_learn_time+= result["timers"]["learn_time_ms"]
    
print("Total learn time: " + str(total_learn_time))
print("Average learn time per iteration: " + str(total_learn_time/N_ITER))
    

Instructions for updating:
Prefer Variable.assign which has equivalent behavior in 2.X.


[2m[36m(pid=1468)[0m Instructions for updating:
[2m[36m(pid=1468)[0m Prefer Variable.assign which has equivalent behavior in 2.X.
[2m[36m(pid=1466)[0m Instructions for updating:
[2m[36m(pid=1466)[0m Prefer Variable.assign which has equivalent behavior in 2.X.


  1 reward -911.00/-768.45/-391.00 len 196.20 learn_time(ms) 4551.36 saved /tmp/ppo/taxi/checkpoint_1/checkpoint-1
  2 reward -911.00/-725.29/-99.00 len 192.17 learn_time(ms) 4207.62 saved /tmp/ppo/taxi/checkpoint_2/checkpoint-2
  3 reward -911.00/-689.68/-99.00 len 191.63 learn_time(ms) 3973.75 saved /tmp/ppo/taxi/checkpoint_3/checkpoint-3
  4 reward -911.00/-655.88/-99.00 len 191.24 learn_time(ms) 3852.65 saved /tmp/ppo/taxi/checkpoint_4/checkpoint-4
  5 reward -911.00/-621.44/-13.00 len 189.41 learn_time(ms) 3780.21 saved /tmp/ppo/taxi/checkpoint_5/checkpoint-5
  6 reward -857.00/-556.93/-13.00 len 185.26 learn_time(ms) 3795.93 saved /tmp/ppo/taxi/checkpoint_6/checkpoint-6
  7 reward -794.00/-470.82/-13.00 len 172.53 learn_time(ms) 3798.64 saved /tmp/ppo/taxi/checkpoint_7/checkpoint-7
  8 reward -794.00/-435.44/-13.00 len 168.53 learn_time(ms) 3799.52 saved /tmp/ppo/taxi/checkpoint_8/checkpoint-8
  9 reward -686.00/-397.51/-37.00 len 163.03 learn_time(ms) 3801.69 saved /tmp/ppo/taxi

In [33]:
#Visualizamos algunos datos del modelo entrenado

policy = agent.get_policy()
model = policy.model
print(model.base_model.summary())
keras.utils.plot_model(model.base_model, "taxi_model.png", show_shapes=True)

print("Variables")
pprint.pprint(model.variables())
print("Value function")
pprint.pprint(model.value_function())
 
#Linea de tiempo que se puede abrir en chrome://tracing y se ve el reparto de tareas y las tareas ejecutasdas por los workers
# así como medir tiempos
ray.timeline("/mnt/c/Users/javig/timelines/time_taxi.json")

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
observations (InputLayer)       [(None, 500)]        0                                            
__________________________________________________________________________________________________
fc_1 (Dense)                    (None, 256)          128256      observations[0][0]               
__________________________________________________________________________________________________
fc_value_1 (Dense)              (None, 256)          128256      observations[0][0]               
__________________________________________________________________________________________________
fc_2 (Dense)                    (None, 256)          65792       fc_1[0][0]                       
_______________________________________________________________________________________

In [34]:
ray.shutdown()

Hemos entrenado un modelo con una capa input con una dimensión concreta, en este caso de tamaño 50, por lo que para probar el modelo necesitamos un entorno que nos proporcione entradas de estas características. Al ser entornos tan concretos no tenemos ninguno que reúna estos requisitos, por lo que lo vamos a probar con el mismo prolema con el que le hemos entrenado.

Ejecutamos un rollout de 10 episodios (esto es, tomamos acciones en los modelos hasta llegaral estado de Done=True 10 veces). Cada episodio tiene una longitud arbitraria. Guardamos toda la info posible en el fichero taxi.pkl.
Medimos el tiempo del rollout en conjunto pues al ser un comando predefinido (en verdad estamos ejecutando el script rollout.py) de momento no vamos a hacer más. Más adelante veremos como precisar esto un poco más. este tiempo es bastante improciso pues cuenta también el tiempo de inicializar ray o escribir los datos, entre otros

El problema del taxi es un entorno denominado 'ToyText', es decir que su salida es únicamente "un texto". Sus estados se codifican como un entero del 0 al 499 y sus acciones con otro entero del 0 al 5 (moverse en las 4 direcciones, dejar o coger un pasajero).

Las recompensas son -1 en cualquier acción salvo si es un intento de coger o soltar al pasajero ilegalmente (porque el pasajero no está donde está el taxi o porque lo intenta soltra cuando no lleva pasajero o en un destino al que no quiere ir) y 20 si sulta al pasajero correctamente (en su posición)

In [None]:
t0=time.time()
!rllib rollout /tmp/ppo/taxi/checkpoint_30/checkpoint-30 --run PPO --env=Taxi-v3 --episodes 10  --out 'taxi.pkl' --save-info --use-shelve --track-progress --video-dir /mnt/c/Users/javig/videoGym
t1=time.time()-t0
print("El rollout ha tardado {} segundos".format(t1))

El fichero de salida está en formato shelve pickle y con un shleve para cada rollout (para cada episodio). En el propio fichero
fuente de rollout.py se nos indica como procesar esta infromación.

Lo que obtenemos es, para cada episodio, el número del mismo y una lista con tantos elementos (que son a su vez listas) como pasos hayamos dado en ese episodio con iformación acerca de los mismos. Esta información es:
- `obs`: estado observado antes de tomar la acción.
- `action`: acción tomada en base al estado observado, la última acción tomada y la última recompensa obtenida
- `next_obs`: el estado observado tras tomar la acción y que será el estado incial del siguiente elemento de la lista
- `done`: booleano que inidca si estamos en un estado 'Done': esto es si hemos conseguido el objetivo del juego o si hemos agotado algún temporizador.
-`info`: este campo sólo se incluye si hemos activado el flag `--save-info`e incluye información sobre el entorno y el problema concreto. En este caso, cuando hemos llegado a un estado 'Done' por agotra un temporizador se nos informa en este diccionario de ello.

In [None]:
# El fichero está en formato shelve pickle y con un shleve para cada rollout (para cada episodio). En el propio fichero
# fuente de rollout.py se nos indica como procesar esta infromación
with shelve.open('taxi.pkl') as rollouts:
    for episode_index in range(rollouts["num_episodes"]):
        rollout = rollouts[str(episode_index)]
        print(str(episode_index))
        pprint.pprint(rollout)


### Función genérica para el entrenamiento.
A continuación, basada en el ejemplo anterior, esta función ejecuta iteraciones de entrenamiento sobre un entorno indicado guardando checkpoints en una ruta especificada y con la configuración elegida por nosotros también. Los parámetros que recibe son:
- `checkpoint_root`: directorio en el que queremos guardar nuestros checkpoints (uno por cada iteración)
- `env`: entorno gym con el que crearemos el agente
- `config`: configuración para nuestro agente: aquí podemos indicar las capas del modelo
- `n_iter`: número de iteraciones a realizar

In [2]:
def full_train(checkpoint_root, env, config, n_iter):
    shutil.rmtree(checkpoint_root, ignore_errors=True, onerror=None)
    agent = ppo.PPOTrainer(config=config, env=env)
    s = "{:3d} reward {:6.2f}/{:6.2f}/{:6.2f} len {:6.2f} learn_time(ms) {:6.2f} saved {}"
    
    

    results = []
    episode_data = []
    episode_json = []

    total_learn_time = 0
    for n in range(n_iter):
        result = agent.train()
        results.append(result)
        episode = {'n': n,
                   'episode_reward_min': result['episode_reward_min'],
                   'episode_reward_mean': result['episode_reward_mean'],
                   'episode_reward_max': result['episode_reward_max'],
                   'episode_len_mean': result['episode_len_mean'],
                   'learn_time_ms': result['timers']['learn_time_ms']}
        episode_data.append(episode)
        episode_json.append(json.dumps(episode))
        file_name = agent.save(checkpoint_root)
        print(s.format(
        n + 1,
        result["episode_reward_min"],
        result["episode_reward_mean"],
        result["episode_reward_max"],
        result["episode_len_mean"],
        result["timers"]["learn_time_ms"],
        file_name
       ))
        total_learn_time+= result["timers"]["learn_time_ms"]

    print("Total learn time: " + str(total_learn_time))
    print("Average learn time per iteration: " + str(total_learn_time/n_iter))
    policy = agent.get_policy()
    model = policy.model
    print(model.base_model.summary())

    print("Variables")
    pprint.pprint(model.variables())
    print("Value function")
    pprint.pprint(model.value_function())
    keras.utils.plot_model(model.base_model, "taxi_model.png", show_shapes=True)

#### Modelo con dos capas internas de 512 neuronas cada una

In [42]:
ray.shutdown()
ray.init(ignore_reinit_error=True, _metrics_export_port=63419)
checkpoint_root = '/tmp/ppo/taxi_2'
env = 'Taxi-v3'
config = ppo.DEFAULT_CONFIG.copy()
config['model']['fcnet_hiddens'] = [512,512]
n_iter = 30
full_train(checkpoint_root, env, config, n_iter)
ray.timeline("/mnt/c/Users/javig/timelines/time_taxi.json")

2020-11-19 19:18:23,380	INFO services.py:1090 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
[2m[36m(pid=2954)[0m Instructions for updating:
[2m[36m(pid=2954)[0m non-resource variables are not supported in the long term
[2m[36m(pid=2960)[0m Instructions for updating:
[2m[36m(pid=2960)[0m non-resource variables are not supported in the long term
[2m[36m(pid=2954)[0m Instructions for updating:
[2m[36m(pid=2954)[0m Prefer Variable.assign which has equivalent behavior in 2.X.
[2m[36m(pid=2960)[0m Instructions for updating:
[2m[36m(pid=2960)[0m Prefer Variable.assign which has equivalent behavior in 2.X.


  1 reward -929.00/-775.55/-303.00 len 193.55 learn_time(ms) 7758.57 saved /tmp/ppo/taxi_2/checkpoint_1/checkpoint-1
  2 reward -929.00/-777.58/-303.00 len 196.78 learn_time(ms) 7717.17 saved /tmp/ppo/taxi_2/checkpoint_2/checkpoint-2
  3 reward -929.00/-757.42/-303.00 len 197.32 learn_time(ms) 8252.94 saved /tmp/ppo/taxi_2/checkpoint_3/checkpoint-3
  4 reward -929.00/-728.15/-233.00 len 194.93 learn_time(ms) 8431.28 saved /tmp/ppo/taxi_2/checkpoint_4/checkpoint-4
  5 reward -929.00/-713.07/-233.00 len 194.79 learn_time(ms) 8523.70 saved /tmp/ppo/taxi_2/checkpoint_5/checkpoint-5
  6 reward -911.00/-685.78/-233.00 len 194.80 learn_time(ms) 8526.81 saved /tmp/ppo/taxi_2/checkpoint_6/checkpoint-6
  7 reward -911.00/-656.82/-233.00 len 194.76 learn_time(ms) 8577.41 saved /tmp/ppo/taxi_2/checkpoint_7/checkpoint-7
  8 reward -911.00/-628.68/-203.00 len 192.06 learn_time(ms) 8595.07 saved /tmp/ppo/taxi_2/checkpoint_8/checkpoint-8
  9 reward -1028.00/-613.69/-76.00 len 191.68 learn_time(ms) 865

#### Modelo con dos capas internas de 1024 neurnas

In [43]:
ray.shutdown()
ray.init(ignore_reinit_error=True, _metrics_export_port=63419)
checkpoint_root = '/tmp/ppo/taxi_3'
env = 'Taxi-v3'
config = ppo.DEFAULT_CONFIG.copy()
config['model']['fcnet_hiddens'] = [1024,1024]
n_iter = 30
full_train(checkpoint_root, env, config, n_iter)
ray.timeline("/mnt/c/Users/javig/timelines/time_taxi.json")

2020-11-19 19:25:17,303	INFO services.py:1090 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
[2m[36m(pid=3130)[0m Instructions for updating:
[2m[36m(pid=3130)[0m non-resource variables are not supported in the long term
[2m[36m(pid=3132)[0m Instructions for updating:
[2m[36m(pid=3132)[0m non-resource variables are not supported in the long term
[2m[36m(pid=3130)[0m Instructions for updating:
[2m[36m(pid=3130)[0m Prefer Variable.assign which has equivalent behavior in 2.X.
[2m[36m(pid=3132)[0m Instructions for updating:
[2m[36m(pid=3132)[0m Prefer Variable.assign which has equivalent behavior in 2.X.


  1 reward -830.00/-755.70/-469.00 len 197.40 learn_time(ms) 25632.94 saved /tmp/ppo/taxi_3/checkpoint_1/checkpoint-1
  2 reward -866.00/-764.77/-469.00 len 198.38 learn_time(ms) 25892.08 saved /tmp/ppo/taxi_3/checkpoint_2/checkpoint-2
  3 reward -965.00/-765.42/-464.00 len 197.67 learn_time(ms) 25895.62 saved /tmp/ppo/taxi_3/checkpoint_3/checkpoint-3
  4 reward -965.00/-754.50/-455.00 len 197.62 learn_time(ms) 25926.15 saved /tmp/ppo/taxi_3/checkpoint_4/checkpoint-4
  5 reward -965.00/-744.88/-212.00 len 195.58 learn_time(ms) 26048.74 saved /tmp/ppo/taxi_3/checkpoint_5/checkpoint-5
  6 reward -965.00/-732.06/-212.00 len 193.32 learn_time(ms) 26050.15 saved /tmp/ppo/taxi_3/checkpoint_6/checkpoint-6
  7 reward -1010.00/-701.97/-212.00 len 191.46 learn_time(ms) 25881.59 saved /tmp/ppo/taxi_3/checkpoint_7/checkpoint-7
  8 reward -1010.00/-677.99/-170.00 len 188.51 learn_time(ms) 25789.05 saved /tmp/ppo/taxi_3/checkpoint_8/checkpoint-8
  9 reward -1010.00/-646.23/-166.00 len 183.93 learn_t

#### Modelo con tres capas internas de 256 neuronas

In [44]:
ray.shutdown()
ray.init(ignore_reinit_error=True, _metrics_export_port=63419)
checkpoint_root = '/tmp/ppo/taxi_2'
env = 'Taxi-v3'
config = ppo.DEFAULT_CONFIG.copy()
config['model']['fcnet_hiddens'] = [256,256,256]
n_iter = 30
full_train(checkpoint_root, env, config, n_iter)
ray.timeline("/mnt/c/Users/javig/timelines/time_taxi.json")

2020-11-19 19:41:08,392	INFO services.py:1090 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
[2m[36m(pid=4074)[0m Instructions for updating:
[2m[36m(pid=4074)[0m non-resource variables are not supported in the long term
[2m[36m(pid=4076)[0m Instructions for updating:
[2m[36m(pid=4076)[0m non-resource variables are not supported in the long term
[2m[36m(pid=4076)[0m Instructions for updating:
[2m[36m(pid=4076)[0m Prefer Variable.assign which has equivalent behavior in 2.X.
[2m[36m(pid=4074)[0m Instructions for updating:
[2m[36m(pid=4074)[0m Prefer Variable.assign which has equivalent behavior in 2.X.


  1 reward -929.00/-794.00/-677.00 len 200.00 learn_time(ms) 5309.74 saved /tmp/ppo/taxi_2/checkpoint_1/checkpoint-1
  2 reward -929.00/-765.15/-369.00 len 195.90 learn_time(ms) 5407.62 saved /tmp/ppo/taxi_2/checkpoint_2/checkpoint-2
  3 reward -929.00/-767.27/-369.00 len 197.27 learn_time(ms) 5661.07 saved /tmp/ppo/taxi_2/checkpoint_3/checkpoint-3
  4 reward -929.00/-762.02/-369.00 len 197.95 learn_time(ms) 5596.58 saved /tmp/ppo/taxi_2/checkpoint_4/checkpoint-4
  5 reward -929.00/-756.42/-369.00 len 198.18 learn_time(ms) 5504.32 saved /tmp/ppo/taxi_2/checkpoint_5/checkpoint-5
  6 reward -965.00/-722.41/-118.00 len 194.38 learn_time(ms) 5443.56 saved /tmp/ppo/taxi_2/checkpoint_6/checkpoint-6
  7 reward -965.00/-692.99/-81.00 len 191.09 learn_time(ms) 5390.56 saved /tmp/ppo/taxi_2/checkpoint_7/checkpoint-7
  8 reward -965.00/-644.77/-37.00 len 182.83 learn_time(ms) 5350.81 saved /tmp/ppo/taxi_2/checkpoint_8/checkpoint-8
  9 reward -965.00/-593.79/-37.00 len 174.27 learn_time(ms) 5403.7

#### Modelo con tres capas internas de 512 neuronas

In [45]:
ray.shutdown()
ray.init(ignore_reinit_error=True, _metrics_export_port=63419)
checkpoint_root = '/tmp/ppo/taxi_4'
env = 'Taxi-v3'
config = ppo.DEFAULT_CONFIG.copy()
config['model']['fcnet_hiddens'] = [512,512,512]
n_iter = 30
full_train(checkpoint_root, env, config, n_iter)
ray.timeline("/mnt/c/Users/javig/timelines/time_taxi.json")

2020-11-19 19:45:29,985	INFO services.py:1090 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
[2m[36m(pid=4272)[0m Instructions for updating:
[2m[36m(pid=4272)[0m non-resource variables are not supported in the long term
[2m[36m(pid=4274)[0m Instructions for updating:
[2m[36m(pid=4274)[0m non-resource variables are not supported in the long term
[2m[36m(pid=4272)[0m Instructions for updating:
[2m[36m(pid=4272)[0m Prefer Variable.assign which has equivalent behavior in 2.X.
[2m[36m(pid=4274)[0m Instructions for updating:
[2m[36m(pid=4274)[0m Prefer Variable.assign which has equivalent behavior in 2.X.


  1 reward -956.00/-792.20/-695.00 len 200.00 learn_time(ms) 13189.22 saved /tmp/ppo/taxi_4/checkpoint_1/checkpoint-1
  2 reward -956.00/-773.80/-175.00 len 196.75 learn_time(ms) 13697.33 saved /tmp/ppo/taxi_4/checkpoint_2/checkpoint-2
  3 reward -956.00/-758.18/-175.00 len 197.83 learn_time(ms) 13476.13 saved /tmp/ppo/taxi_4/checkpoint_3/checkpoint-3
  4 reward -956.00/-740.06/-175.00 len 196.62 learn_time(ms) 13430.25 saved /tmp/ppo/taxi_4/checkpoint_4/checkpoint-4
  5 reward -956.00/-738.22/-175.00 len 196.72 learn_time(ms) 13395.33 saved /tmp/ppo/taxi_4/checkpoint_5/checkpoint-5
  6 reward -956.00/-714.26/-217.00 len 196.67 learn_time(ms) 13362.52 saved /tmp/ppo/taxi_4/checkpoint_6/checkpoint-6
  7 reward -956.00/-678.76/-100.00 len 194.11 learn_time(ms) 13376.89 saved /tmp/ppo/taxi_4/checkpoint_7/checkpoint-7
  8 reward -983.00/-654.51/-100.00 len 190.86 learn_time(ms) 13333.97 saved /tmp/ppo/taxi_4/checkpoint_8/checkpoint-8
  9 reward -983.00/-647.96/-100.00 len 189.62 learn_time

#### Modelo con tres capas internas de 1024 neuronas

In [5]:
ray.shutdown()
ray.init(ignore_reinit_error=True, _metrics_export_port=63419)
checkpoint_root = '/tmp/ppo/taxi_5'
env = 'Taxi-v3'
config = ppo.DEFAULT_CONFIG.copy()
config['model']['fcnet_hiddens'] = [1024,1024,1024]
n_iter = 30
full_train(checkpoint_root, env, config, n_iter)
ray.timeline("/mnt/c/Users/javig/timelines/time_taxi.json")

2020-11-19 21:01:12,454	INFO services.py:1090 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
2020-11-19 21:01:14,973	INFO trainer.py:592 -- Tip: set framework=tfe or the --eager flag to enable TensorFlow eager execution
2020-11-19 21:01:14,974	INFO trainer.py:1064 -- `_use_trajectory_view_api` only supported for PyTorch so far! Will run w/o.
2020-11-19 21:01:14,975	INFO trainer.py:617 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=7592)[0m Instructions for updating:
[2m[36m(pid=7592)[0m non-resource variables are not supported in the long term
[2m[36m(pid=7596)[0m Instructions for updating:
[2m[36m(pid=7596)[0m non-resource variables are not supported in the long term


Instructions for updating:
Prefer Variable.assign which has equivalent behavior in 2.X.


[2m[36m(pid=7596)[0m Instructions for updating:
[2m[36m(pid=7596)[0m Prefer Variable.assign which has equivalent behavior in 2.X.
[2m[36m(pid=7592)[0m Instructions for updating:
[2m[36m(pid=7592)[0m Prefer Variable.assign which has equivalent behavior in 2.X.


  1 reward -884.00/-810.65/-677.00 len 200.00 learn_time(ms) 41042.34 saved /tmp/ppo/taxi_5/checkpoint_1/checkpoint-1
  2 reward -884.00/-785.00/-677.00 len 200.00 learn_time(ms) 41152.00 saved /tmp/ppo/taxi_5/checkpoint_2/checkpoint-2
  3 reward -893.00/-753.02/-250.00 len 194.87 learn_time(ms) 41330.16 saved /tmp/ppo/taxi_5/checkpoint_3/checkpoint-3
  4 reward -893.00/-747.06/-250.00 len 196.15 learn_time(ms) 41324.88 saved /tmp/ppo/taxi_5/checkpoint_4/checkpoint-4
  5 reward -1019.00/-749.56/-250.00 len 196.90 learn_time(ms) 41342.14 saved /tmp/ppo/taxi_5/checkpoint_5/checkpoint-5
  6 reward -1019.00/-726.07/-250.00 len 196.90 learn_time(ms) 41491.87 saved /tmp/ppo/taxi_5/checkpoint_6/checkpoint-6
  7 reward -1019.00/-699.00/-91.00 len 193.83 learn_time(ms) 41522.85 saved /tmp/ppo/taxi_5/checkpoint_7/checkpoint-7
  8 reward -1019.00/-663.07/-63.00 len 191.38 learn_time(ms) 42810.95 saved /tmp/ppo/taxi_5/checkpoint_8/checkpoint-8
  9 reward -956.00/-636.42/-63.00 len 189.03 learn_tim

#### Modelo con tres capas internas con la estructura \[256, 512, 256\]

In [6]:
ray.shutdown()
ray.init(ignore_reinit_error=True, _metrics_export_port=63419)
checkpoint_root = '/tmp/ppo/taxi_6'
env = 'Taxi-v3'
config = ppo.DEFAULT_CONFIG.copy()
config['model']['fcnet_hiddens'] = [256,512,256]
n_iter = 30
full_train(checkpoint_root, env, config, n_iter)
ray.timeline("/mnt/c/Users/javig/timelines/time_taxi.json")

2020-11-19 21:26:51,364	INFO services.py:1090 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
[2m[36m(pid=8969)[0m Instructions for updating:
[2m[36m(pid=8969)[0m non-resource variables are not supported in the long term
[2m[36m(pid=8971)[0m Instructions for updating:
[2m[36m(pid=8971)[0m non-resource variables are not supported in the long term
[2m[36m(pid=8971)[0m Instructions for updating:
[2m[36m(pid=8971)[0m Prefer Variable.assign which has equivalent behavior in 2.X.
[2m[36m(pid=8969)[0m Instructions for updating:
[2m[36m(pid=8969)[0m Prefer Variable.assign which has equivalent behavior in 2.X.


  1 reward -857.00/-742.65/-82.00 len 192.00 learn_time(ms) 7505.29 saved /tmp/ppo/taxi_6/checkpoint_1/checkpoint-1
  2 reward -875.00/-730.33/-82.00 len 193.25 learn_time(ms) 7317.44 saved /tmp/ppo/taxi_6/checkpoint_2/checkpoint-2
  3 reward -875.00/-719.39/-82.00 len 192.89 learn_time(ms) 7240.03 saved /tmp/ppo/taxi_6/checkpoint_3/checkpoint-3
  4 reward -902.00/-726.38/-82.00 len 194.24 learn_time(ms) 7206.93 saved /tmp/ppo/taxi_6/checkpoint_4/checkpoint-4
  5 reward -902.00/-724.15/-262.00 len 196.45 learn_time(ms) 7180.29 saved /tmp/ppo/taxi_6/checkpoint_5/checkpoint-5
  6 reward -902.00/-693.88/-170.00 len 195.10 learn_time(ms) 7163.70 saved /tmp/ppo/taxi_6/checkpoint_6/checkpoint-6
  7 reward -902.00/-678.65/-137.00 len 194.36 learn_time(ms) 7158.40 saved /tmp/ppo/taxi_6/checkpoint_7/checkpoint-7
  8 reward -902.00/-654.40/-137.00 len 192.43 learn_time(ms) 7151.22 saved /tmp/ppo/taxi_6/checkpoint_8/checkpoint-8
  9 reward -902.00/-609.08/-101.00 len 187.19 learn_time(ms) 7145.72

#### Modelo con cuatro capas internas de 256 neuronas

In [9]:
ray.shutdown()
ray.init(ignore_reinit_error=True, _metrics_export_port=63419)
checkpoint_root = '/tmp/ppo/taxi_7'
env = 'Taxi-v3'
config = ppo.DEFAULT_CONFIG.copy()
config['model']['fcnet_hiddens'] = [256,256,256,256]
n_iter = 30
full_train(checkpoint_root, env, config, n_iter)
ray.timeline("/mnt/c/Users/javig/timelines/time_taxi.json")

2020-11-19 23:15:30,826	INFO services.py:1090 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
[2m[36m(pid=9544)[0m Instructions for updating:
[2m[36m(pid=9544)[0m non-resource variables are not supported in the long term
[2m[36m(pid=9547)[0m Instructions for updating:
[2m[36m(pid=9547)[0m non-resource variables are not supported in the long term
[2m[36m(pid=9547)[0m Instructions for updating:
[2m[36m(pid=9547)[0m Prefer Variable.assign which has equivalent behavior in 2.X.
[2m[36m(pid=9544)[0m Instructions for updating:
[2m[36m(pid=9544)[0m Prefer Variable.assign which has equivalent behavior in 2.X.


  1 reward -893.00/-789.05/-713.00 len 200.00 learn_time(ms) 6924.62 saved /tmp/ppo/taxi_7/checkpoint_1/checkpoint-1
  2 reward -893.00/-779.83/-605.00 len 200.00 learn_time(ms) 6892.72 saved /tmp/ppo/taxi_7/checkpoint_2/checkpoint-2
  3 reward -920.00/-774.42/-605.00 len 199.22 learn_time(ms) 6738.23 saved /tmp/ppo/taxi_7/checkpoint_3/checkpoint-3
  4 reward -947.00/-770.16/-332.00 len 197.84 learn_time(ms) 6676.10 saved /tmp/ppo/taxi_7/checkpoint_4/checkpoint-4
  5 reward -974.00/-759.60/-111.00 len 196.05 learn_time(ms) 6728.55 saved /tmp/ppo/taxi_7/checkpoint_5/checkpoint-5
  6 reward -1010.00/-730.05/-111.00 len 191.73 learn_time(ms) 6731.40 saved /tmp/ppo/taxi_7/checkpoint_6/checkpoint-6
  7 reward -1010.00/-710.84/-111.00 len 192.20 learn_time(ms) 6756.06 saved /tmp/ppo/taxi_7/checkpoint_7/checkpoint-7
  8 reward -1010.00/-678.06/-111.00 len 188.94 learn_time(ms) 6691.53 saved /tmp/ppo/taxi_7/checkpoint_8/checkpoint-8
  9 reward -1010.00/-644.73/-111.00 len 187.83 learn_time(ms)

### Midiendo tiempos específicos en el rollout
Como hemos visto antes, el script de rollout no nos aporta información acerca de los tiempos específicos. Para ellos podemos acceder al código fuente y modificar esto manualmente.

Indicamos en el código donde hemos añadido el temporizador y guardamos en un array los distintos tiempos por episodio. Vamos a modificar el script `rollout.py` y ejecutarlo directamente con los parámetros deseados.



In [13]:
def rollout(agent,
            env_name,
            num_steps,
            num_episodes=0,
            saver=None,
            no_render=True,
            video_dir=None):
    policy_agent_mapping = default_policy_agent_mapping

    if saver is None:
        saver = RolloutSaver()

    if hasattr(agent, "workers") and isinstance(agent.workers, WorkerSet):
        env = agent.workers.local_worker().env
        multiagent = isinstance(env, MultiAgentEnv)
        if agent.workers.local_worker().multiagent:
            policy_agent_mapping = agent.config["multiagent"][
                "policy_mapping_fn"]

        policy_map = agent.workers.local_worker().policy_map
        state_init = {p: m.get_initial_state() for p, m in policy_map.items()}
        use_lstm = {p: len(s) > 0 for p, s in state_init.items()}
    else:
        env = gym.make(env_name)
        multiagent = False
        try:
            policy_map = {DEFAULT_POLICY_ID: agent.policy}
        except AttributeError:
            raise AttributeError(
                "Agent ({}) does not have a `policy` property! This is needed "
                "for performing (trained) agent rollouts.".format(agent))
        use_lstm = {DEFAULT_POLICY_ID: False}

    action_init = {
        p: flatten_to_single_ndarray(m.action_space.sample())
        for p, m in policy_map.items()
    }

    # If monitoring has been requested, manually wrap our environment with a
    # gym monitor, which is set to record every episode.
    if video_dir:
        env = gym_wrappers.Monitor(
            env=env,
            directory=video_dir,
            video_callable=lambda x: True,
            force=True)

    steps = 0
    episodes = 0
    times = []
    while keep_going(steps, num_steps, episodes, num_episodes): # Este bucle es para el número de episodio o pasos
        mapping_cache = {}  # in case policy_agent_mapping is stochastic
        saver.begin_rollout()
        obs = env.reset()
        agent_states = DefaultMapping(
            lambda agent_id: state_init[mapping_cache[agent_id]])
        prev_actions = DefaultMapping(
            lambda agent_id: action_init[mapping_cache[agent_id]])
        prev_rewards = collections.defaultdict(lambda: 0.)
        done = False
        reward_total = 0.0
        episode_time = 0.0
        while not done and keep_going(steps, num_steps, episodes,
                                      num_episodes):  #este bucle es para cada episodio
            multi_obs = obs if multiagent else {_DUMMY_AGENT_ID: obs}
            action_dict = {}
            for agent_id, a_obs in multi_obs.items(): 
                if a_obs is not None:
                    policy_id = mapping_cache.setdefault(
                        agent_id, policy_agent_mapping(agent_id))
                    p_use_lstm = use_lstm[policy_id]
                    # Aquí es donde empezamos a aplicar el modelo para ver que acción tomar.
                    
                    ################
                    t0 = time.time()
                    ################
                    
                    if p_use_lstm:
                        a_action, p_state, _ = agent.compute_action(
                            a_obs,
                            state=agent_states[agent_id],
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        agent_states[agent_id] = p_state
                    else:
                        a_action = agent.compute_action(
                            a_obs,
                            prev_action=prev_actions[agent_id],
                            prev_reward=prev_rewards[agent_id],
                            policy_id=policy_id)
                        
                    ########################
                    t1 = time.time()
                    episode_time += (t1-t0)
                    ########################
                    
                    a_action = flatten_to_single_ndarray(a_action)
                    action_dict[agent_id] = a_action
                    prev_actions[agent_id] = a_action
            action = action_dict

            action = action if multiagent else action[_DUMMY_AGENT_ID]
            next_obs, reward, done, info = env.step(action)
            if multiagent:
                for agent_id, r in reward.items():
                    prev_rewards[agent_id] = r
            else:
                prev_rewards[_DUMMY_AGENT_ID] = reward

            if multiagent:
                done = done["__all__"]
                reward_total += sum(reward.values())
            else:
                reward_total += reward
            if not no_render:
                env.render()
            saver.append_step(obs, action, next_obs, reward, done, info)
            steps += 1
            obs = next_obs
        saver.end_rollout()
        print("Episode #{}: reward: {}".format(episodes, reward_total))
        
        ####################################################################
        print("Episode #{}: model_time: {}".format(episodes, episode_time))
        times.append[episode_time]
        ####################################################################
        
        if done:
            episodes += 1
            
    ########################
    print("Episodes times:")
    print(times)
    ########################

In [None]:
ray.shutdown()
t0=time.time()
!python3 rollout.py /tmp/ppo/taxi/checkpoint_30/checkpoint-30 --env=Taxi-v3 --run PPO --episodes 10 --out='taxi_time.pkl' --save-info --use-shelve
t1 = time.time()-t0
print("Rollout total time: " + str(t1))

### Juegos basados en imágenes: categoría Atari

Estos juegos aprenden a partir de imágenes de videojuegos. Hemos elgido como ejemplo el entorno `Pong-v0`.
La red neuronal que se crea es una VisionNet de TF, definida para RLLIB y que cuenta con tantas como filtros de convolución se especifiquen en la configuración.

Las observaciones tienen un tamaño de (210,160,3), que se preprocesa para convertirla en una de la forma (84,84,4), dimesiones que sí son compatibles con los modelos preconfigurados de tensorflow. 

In [27]:
ray.shutdown()
ray.init()
config = ppo.DEFAULT_CONFIG.copy()
agent = ppo.PPOTrainer(config, env='Pong-v0')
policy=agent.get_policy()
print(policy.model.model_config)
print(policy.model.base_model.summary())

2020-11-20 01:59:16,010	INFO services.py:1090 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
[2m[36m(pid=11851)[0m Instructions for updating:
[2m[36m(pid=11851)[0m non-resource variables are not supported in the long term
[2m[36m(pid=11853)[0m Instructions for updating:
[2m[36m(pid=11853)[0m non-resource variables are not supported in the long term


{'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': [[16, [8, 8], 4], [32, [4, 4], 2], [256, [11, 11], 1]], 'conv_activation': 'relu', 'free_log_std': False, 'no_final_linear': False, 'vf_share_layers': False, 'use_lstm': False, 'max_seq_len': 20, 'lstm_cell_size': 256, 'lstm_use_prev_action_reward': False, '_time_major': False, 'framestack': True, 'dim': 84, 'grayscale': False, 'zero_mean': True, 'custom_model': None, 'custom_model_config': {}, 'custom_action_dist': None, 'custom_preprocessor': None}
Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
observations (InputLayer)       [(None, 84, 84, 4)]  0                                            
__________________________________________________________________________________________________
conv_value_1 (Conv2D)           (None, 21, 21, 16)   

Inspeccionando el código de `visionnet.py` encontramos que se nos crea, por defecto, una red con tres capas de convolución con los parámetros especificados.
- Tamaño del espacio de salida (número de filtros en la convolución)
- Tamaño de la ventana de convolución
- Strides: desplazamiento de la ventana por ancho y por largo

Ahora ejecutamos un entrenamiento de 5 episodios (tarda casi una hora en completarse) con el entorno `Pong-v0`

In [30]:
ray.shutdown()
ray.init(ignore_reinit_error=True, _metrics_export_port=63419)
checkpoint_root = '/tmp/ppo/pong'
env = 'Pong-v0'
config = ppo.DEFAULT_CONFIG.copy()
n_iter = 5
full_train(checkpoint_root, env, config, n_iter)
#ray.timeline("/mnt/c/Users/javig/timelines/time_taxi.json")

2020-11-19 00:25:17,174	INFO services.py:1090 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
[2m[36m(pid=3859)[0m Instructions for updating:
[2m[36m(pid=3859)[0m non-resource variables are not supported in the long term
[2m[36m(pid=3857)[0m Instructions for updating:
[2m[36m(pid=3857)[0m non-resource variables are not supported in the long term
2020-11-19 00:25:30,829	INFO trainable.py:252 -- Trainable.setup took 10.082 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.
[2m[36m(pid=3857)[0m Instructions for updating:
[2m[36m(pid=3857)[0m Prefer Variable.assign which has equivalent behavior in 2.X.
[2m[36m(pid=3859)[0m Instructions for updating:
[2m[36m(pid=3859)[0m Prefer Variable.assign which has equivalent behavior in 2.X.


  1 reward    nan/   nan/   nan len    nan learn_time(ms) 453236.95 saved /tmp/ppo/pong/checkpoint_1/checkpoint-1
  2 reward -21.00/-21.00/-21.00 len 1021.25 learn_time(ms) 444702.60 saved /tmp/ppo/pong/checkpoint_2/checkpoint-2
  3 reward -21.00/-21.00/-21.00 len 1021.50 learn_time(ms) 441895.95 saved /tmp/ppo/pong/checkpoint_3/checkpoint-3
  4 reward -21.00/-21.00/-21.00 len 1024.75 learn_time(ms) 437422.36 saved /tmp/ppo/pong/checkpoint_4/checkpoint-4
  5 reward -21.00/-21.00/-21.00 len 1023.06 learn_time(ms) 442213.97 saved /tmp/ppo/pong/checkpoint_5/checkpoint-5
Total learn time: 2219471.838
Average learn time per iteration: 73982.3946
Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
observations (InputLayer)       [(None, 84, 84, 4)]  0                                            
________________________________

Ejecutamos rollout con el comando de consola, sin poder medir los tiempos

In [2]:
ray.shutdown()
t0=time.time()
!rllib rollout /tmp/ppo/pong/checkpoint_5/checkpoint-5 --env=AirRaid-v0 --run PPO --episodes=10 --out='airRaid.pkl' --save-info --use-shelve --no-render
t1 = time.time()-t0
print("Rollout time: " + str(t1))

Instructions for updating:
non-resource variables are not supported in the long term
2020-12-02 00:50:15,913	INFO services.py:1090 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
2020-12-02 00:50:18,577	INFO trainer.py:592 -- Tip: set framework=tfe or the --eager flag to enable TensorFlow eager execution
2020-12-02 00:50:18,577	INFO trainer.py:1064 -- `_use_trajectory_view_api` only supported for PyTorch so far! Will run w/o.
2020-12-02 00:50:18,577	INFO trainer.py:617 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=1702)[0m Instructions for updating:
[2m[36m(pid=1702)[0m non-resource variables are not supported in the long term
[2m[36m(pid=1706)[0m Instructions for updating:
[2m[36m(pid=1706)[0m non-resource variables are not supported in the long term
2020-12-02 00:50:26,754	INFO trainable.py:481 -- Restored on 10.10.1.128 from checkpoint: /tmp/ppo/pong/checkpoint_5/checkp

In [None]:
with shelve.open('airRaid.pkl') as rollouts:
    for episode_index in range(rollouts["num_episodes"]):
        rollout = rollouts[str(episode_index)]
        print(str(episode_index))
        pprint.pprint(rollout)

In [5]:
ray.shutdown()
t0=time.time()
!python3 rollout.py /tmp/ppo/pong/checkpoint_5/checkpoint-5 --env='AirRaid-v0' --run PPO --episodes 10 --out='airRaid_time.pkl' --save-info --use-shelve
t1 = time.time()-t0
print("Rollout total time: " + str(t1))

Instructions for updating:
non-resource variables are not supported in the long term
2020-12-02 01:33:44,618	INFO services.py:1090 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
2020-12-02 01:33:47,311	INFO trainer.py:592 -- Tip: set framework=tfe or the --eager flag to enable TensorFlow eager execution
2020-12-02 01:33:47,311	INFO trainer.py:1064 -- `_use_trajectory_view_api` only supported for PyTorch so far! Will run w/o.
2020-12-02 01:33:47,311	INFO trainer.py:617 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=2246)[0m Instructions for updating:
[2m[36m(pid=2246)[0m non-resource variables are not supported in the long term
[2m[36m(pid=2249)[0m Instructions for updating:
[2m[36m(pid=2249)[0m non-resource variables are not supported in the long term
2020-12-02 01:33:55,462	INFO trainable.py:481 -- Restored on 10.10.1.128 from checkpoint: /tmp/ppo/pong/checkpoint_5/checkp

In [10]:
ray.shutdown()
t0=time.time()
!python3 rollout.py /tmp/ppo/pong/checkpoint_5/checkpoint-5 --env='Bowling-v0' --run PPO --episodes 10 --out='bowling.pkl' --save-info --use-shelve
t1 = time.time()-t0
print("Rollout total time: " + str(t1))

Instructions for updating:
non-resource variables are not supported in the long term
2020-11-20 10:45:41,326	INFO services.py:1090 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
2020-11-20 10:45:43,955	INFO trainer.py:592 -- Tip: set framework=tfe or the --eager flag to enable TensorFlow eager execution
2020-11-20 10:45:43,955	INFO trainer.py:1064 -- `_use_trajectory_view_api` only supported for PyTorch so far! Will run w/o.
2020-11-20 10:45:43,955	INFO trainer.py:617 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=1168)[0m Instructions for updating:
[2m[36m(pid=1168)[0m non-resource variables are not supported in the long term
[2m[36m(pid=1171)[0m Instructions for updating:
[2m[36m(pid=1171)[0m non-resource variables are not supported in the long term
2020-11-20 10:45:51,945	INFO trainable.py:481 -- Restored on 10.10.1.128 from checkpoint: /tmp/ppo/pong/checkpoint_5/checkp

In [7]:
ray.shutdown()
t0=time.time()
!python3 rollout.py /tmp/ppo/pong/checkpoint_5/checkpoint-5 --env='Carnival-v0' --run PPO --episodes 10 --out='carnival.pkl' --save-info --use-shelve
t1 = time.time()-t0
print("Rollout total time: " + str(t1))

Instructions for updating:
non-resource variables are not supported in the long term
2020-12-02 00:39:39,824	INFO services.py:1090 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
2020-12-02 00:39:42,521	INFO trainer.py:592 -- Tip: set framework=tfe or the --eager flag to enable TensorFlow eager execution
2020-12-02 00:39:42,521	INFO trainer.py:1064 -- `_use_trajectory_view_api` only supported for PyTorch so far! Will run w/o.
2020-12-02 00:39:42,521	INFO trainer.py:617 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=602)[0m Instructions for updating:
[2m[36m(pid=602)[0m non-resource variables are not supported in the long term
[2m[36m(pid=600)[0m Instructions for updating:
[2m[36m(pid=600)[0m non-resource variables are not supported in the long term
2020-12-02 00:39:50,372	INFO trainable.py:481 -- Restored on 10.10.1.128 from checkpoint: /tmp/ppo/pong/checkpoint_5/checkpoint

In [12]:
ray.shutdown()
t0=time.time()
!python3 rollout.py /tmp/ppo/pong/checkpoint_5/checkpoint-5 --env='DemonAttack-v0' --run PPO --episodes 10 --out='demonattack.pkl' --save-info --use-shelve
t1 = time.time()-t0
print("Rollout total time: " + str(t1))

Instructions for updating:
non-resource variables are not supported in the long term
2020-11-20 10:59:10,489	INFO services.py:1090 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
2020-11-20 10:59:12,753	INFO trainer.py:592 -- Tip: set framework=tfe or the --eager flag to enable TensorFlow eager execution
2020-11-20 10:59:12,753	INFO trainer.py:1064 -- `_use_trajectory_view_api` only supported for PyTorch so far! Will run w/o.
2020-11-20 10:59:12,753	INFO trainer.py:617 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=1706)[0m Instructions for updating:
[2m[36m(pid=1706)[0m non-resource variables are not supported in the long term
[2m[36m(pid=1704)[0m Instructions for updating:
[2m[36m(pid=1704)[0m non-resource variables are not supported in the long term
2020-11-20 10:59:20,771	INFO trainable.py:481 -- Restored on 10.10.1.128 from checkpoint: /tmp/ppo/pong/checkpoint_5/checkp

In [17]:
ray.shutdown()
t0=time.time()
!python3 rollout.py /tmp/ppo/pong/checkpoint_5/checkpoint-5 --env='Pong-v0' --run PPO --episodes 10 --out='pong.pkl' --save-info --use-shelve
t1 = time.time()-t0
print("Rollout total time: " + str(t1))

  File "rollout.py", line 466
    average_time = episode_time/steps
                                    ^
TabError: inconsistent use of tabs and spaces in indentation
Rollout total time: 0.46683597564697266


In [29]:
ray.shutdown()
ray.init()
config = ppo.DEFAULT_CONFIG.copy()
agent = ppo.PPOTrainer(config, env='Pong-v0')

2020-11-20 12:08:11,766	INFO services.py:1090 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
[2m[36m(pid=3812)[0m Instructions for updating:
[2m[36m(pid=3812)[0m non-resource variables are not supported in the long term
[2m[36m(pid=3815)[0m Instructions for updating:
[2m[36m(pid=3815)[0m non-resource variables are not supported in the long term


In [27]:
agent.get_policy()

<ray.rllib.policy.tf_policy_template.PPOTFPolicy at 0x7f426404c430>

In [28]:
policy=agent.get_policy()
print(policy.model.model_config)
print(policy.model.base_model.summary())
print(dir(policy.model))
print(policy.model.inference_view_requirements)

{'fcnet_hiddens': [256, 256], 'fcnet_activation': 'tanh', 'conv_filters': None, 'conv_activation': 'relu', 'free_log_std': False, 'no_final_linear': False, 'vf_share_layers': False, 'use_lstm': False, 'max_seq_len': 20, 'lstm_cell_size': 256, 'lstm_use_prev_action_reward': False, '_time_major': False, 'framestack': True, 'dim': 84, 'grayscale': False, 'zero_mean': True, 'custom_model': None, 'custom_model_config': {}, 'custom_action_dist': None, 'custom_preprocessor': None}
Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
observations (InputLayer)       [(None, 128)]        0                                            
__________________________________________________________________________________________________
fc_1 (Dense)                    (None, 256)          33024       observations[0][0]               
____

In [4]:
env = gym.make("CarRacing-v0")
print(env.reset().shape)
env.close()



Track generation: 1255..1573 -> 318-tiles track
(96, 96, 3)


In [46]:
from ray.rllib.models.preprocessors import get_preprocessor
prep = get_preprocessor(env.observation_space)(env.observation_space)
prep

<ray.rllib.models.preprocessors.GenericPixelPreprocessor at 0x7f41bfee7fd0>

In [47]:
prep.transform(env.reset()).shape

(84, 84, 3)

In [8]:
env = gym.make("Pong-v0")
print(type(env.reset()))

<class 'numpy.ndarray'>


In [None]:
print(env.reset()[25:-25, :, :])
print(env.reset())

In [6]:
import gym
env = gym.make('CartPole-v0')
env.reset()
for _ in range(1000):
    env.render()
    env.step(env.action_space.sample()) # take a random action
env.close()

NameError: name 'base' is not defined

In [2]:
ray.shutdown()
ray.init(ignore_reinit_error=True, _metrics_export_port=63419)
#checkpoint_root = '/tmp/ppo/taxi_3'
env = 'Pong-v0'
config = ppo.DEFAULT_CONFIG.copy()
config['model']['dim'] = 168
config['model']['conv_filters'] = [[16, [16, 16], 8],[32, [8, 8], 2],[256, [11, 11], 1]]
agent = ppo.PPOTrainer(config, env=env)

2020-12-03 19:32:13,564	INFO services.py:1090 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m
2020-12-03 19:32:22,119	INFO trainer.py:592 -- Tip: set framework=tfe or the --eager flag to enable TensorFlow eager execution
2020-12-03 19:32:22,121	INFO trainer.py:1064 -- `_use_trajectory_view_api` only supported for PyTorch so far! Will run w/o.
2020-12-03 19:32:22,122	INFO trainer.py:617 -- Current log_level is WARN. For more information, set 'log_level': 'INFO' / 'DEBUG' or use the -v and -vv flags.
[2m[36m(pid=275)[0m Instructions for updating:
[2m[36m(pid=275)[0m non-resource variables are not supported in the long term
[2m[36m(pid=276)[0m Instructions for updating:
[2m[36m(pid=276)[0m non-resource variables are not supported in the long term
2020-12-03 19:32:36,683	INFO trainable.py:252 -- Trainable.setup took 14.564 seconds. If your trainable is slow to initialize, consider setting reuse_actors=True to reduce actor creation overheads.


In [3]:
print(agent.get_policy().model.base_model.summary())

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
observations (InputLayer)       [(None, 168, 168, 4) 0                                            
__________________________________________________________________________________________________
conv_value_1 (Conv2D)           (None, 21, 21, 16)   16400       observations[0][0]               
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, 21, 21, 16)   16400       observations[0][0]               
__________________________________________________________________________________________________
conv_value_2 (Conv2D)           (None, 11, 11, 32)   32800       conv_value_1[0][0]               
_______________________________________________________________________________________

In [None]:
ray.shutdown()
ray.init(ignore_reinit_error=True, _metrics_export_port=63419)
#checkpoint_root = '/tmp/ppo/taxi_3'
env = 'Pong-v0'
config = ppo.DEFAULT_CONFIG.copy()
#config['model']['dim'] = 84
#config['preprocessor_pref'] = 'deepmind'
agent = ppo.PPOTrainer(config, env=env)

In [10]:
print(agent.get_policy().model.base_model.summary())

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
observations (InputLayer)       [(None, 168, 168, 4) 0                                            
__________________________________________________________________________________________________
conv_value_1 (Conv2D)           (None, 21, 21, 16)   16400       observations[0][0]               
__________________________________________________________________________________________________
conv1 (Conv2D)                  (None, 21, 21, 16)   16400       observations[0][0]               
__________________________________________________________________________________________________
conv_value_2 (Conv2D)           (None, 11, 11, 32)   32800       conv_value_1[0][0]               
_______________________________________________________________________________________

[2m[36m(pid=gcs_server)[0m F1203 23:49:26.063741  1432  1432 redis_client.cc:74]  Check failed: num_attempts < RayConfig::instance().redis_db_connect_retries() Expected 1 Redis shard addresses, found 2
[2m[36m(pid=gcs_server)[0m *** Check failure stack trace: ***
[2m[36m(pid=gcs_server)[0m     @     0x7fbb47f0b4cd  (unknown)
[2m[36m(pid=gcs_server)[0m     @     0x7fbb47f0c93c  (unknown)
[2m[36m(pid=gcs_server)[0m     @     0x7fbb47f0b1a9  (unknown)
[2m[36m(pid=gcs_server)[0m     @     0x7fbb47f0b3c1  (unknown)
[2m[36m(pid=gcs_server)[0m     @     0x7fbb47ecff39  (unknown)
[2m[36m(pid=gcs_server)[0m     @     0x7fbb47e00773  (unknown)
[2m[36m(pid=gcs_server)[0m     @     0x7fbb47e01291  (unknown)
[2m[36m(pid=gcs_server)[0m     @     0x7fbb47da00ba  (unknown)
[2m[36m(pid=gcs_server)[0m     @     0x7fbb47cb2da1  (unknown)
[2m[36m(pid=gcs_server)[0m     @     0x7fbb47c56a8c  (unknown)
[2m[36m(pid=gcs_server)[0m     @     0x7fbb47c201ac  (unknown)
[2m