In [1]:
import os
import shutil
import time

from offline_dataset.dataset_creater import GymParallelSampler

from envs.env_creator import ibgym_env_creator, env_creator, IBGymModelQ_creator
from state_quantization.transforms import quantize_transform_creator
from q_learning.algorithm import QLPolicy
from ppo.policy import LSTMPPOPolicy

In [2]:
# --- Sampling budget -------------------------------------------------------
episodes = 1000
steps_per_episode = 1000
workers = 8

# --- Filesystem locations --------------------------------------------------
# Directory handed to GymParallelSampler as `path`; it is wiped before sampling.
writer_path = os.path.join("tmp", "ibqf-out")
# Pickled Q-learning policy, only loaded when `use_policy` is True.
policy_save_path = 'tmp/q_learning/mb_q_policy_best_model_aeq-16bits_203871.pkl'

# --- State-quantization model ----------------------------------------------
quant_model = 'model_h_c-20bits3'
model_path = f'tmp/state_quantization/{quant_model}'

# Keyword arguments forwarded as `buffer_transform_kwargs` when sampling
# without a policy.
# NOTE(review): the trailing 6 in `reshape` is presumably the per-step
# observation width -- TODO confirm against the quantization model.
q_transform_kwargs = dict(
    device='cpu',
    keys=['obs', 'new_obs'],
    reshape=(steps_per_episode, -1, 6),
    model_path=model_path,
)

# False -> sample with policy=None and quantize the buffer afterwards;
# True  -> load the saved QLPolicy and sample with it.
use_policy = False

In [3]:
# Remove the output directory left behind by a previous run, if there is one.
# `isdir` is already False for a missing path, so no separate existence check
# is required.
if os.path.isdir(writer_path):
    shutil.rmtree(writer_path)

In [4]:


# Wall-clock reference taken before the sampler is configured.
start = time.time()

# Keyword arguments that are identical for both sampling modes.
sampler_kwargs = dict(
    path=writer_path,
    episodes=episodes,
    workers=workers,
    reward_threshold=None,
)

if use_policy:
    # Policy-driven sampling: load the saved Q-learning policy and give the
    # environment creator the quantization model path.
    policy = QLPolicy.load(policy_save_path)
    env_kwargs = {'steps_per_episode': steps_per_episode,'model_path':model_path}
    sampler_kwargs.update(
        env_creator=env_creator,
        env_kwargs=env_kwargs,
        policy=policy,
    )
else:
    # No policy: sample from the IB gym environment and apply the
    # quantization transform to the collected buffer.
    env_kwargs = {'steps_per_episode': steps_per_episode}
    sampler_kwargs.update(
        env_creator=ibgym_env_creator,
        env_kwargs=env_kwargs,
        buffer_transform=quantize_transform_creator,
        buffer_transform_kwargs=q_transform_kwargs,
        policy=None,
    )

parallel_sampler = GymParallelSampler(**sampler_kwargs)



In [5]:
# Time only the sampling call. The timer is (re)started inside this cell so
# the reported duration does not include sampler construction or any idle time
# between executing cells, and re-running this cell alone gives a meaningful
# number. perf_counter() is a monotonic, high-resolution clock meant for
# measuring elapsed intervals.
start = time.perf_counter()
parallel_sampler.sample()
end = time.perf_counter()
print(end - start)  # elapsed seconds

Episodes Sampled: 0
Episodes Sampled: 0
Episodes Sampled: 0
Episodes Sampled: 0
Episodes Sampled: 0
Episodes Sampled: 0
Episodes Sampled: 21
Episodes Sampled: 50
Episodes Sampled: 78
Episodes Sampled: 105
Episodes Sampled: 132
Episodes Sampled: 161
Episodes Sampled: 187
Episodes Sampled: 214
Episodes Sampled: 242
Episodes Sampled: 269
Episodes Sampled: 294
Episodes Sampled: 323
Episodes Sampled: 350
Episodes Sampled: 377
Episodes Sampled: 405
Episodes Sampled: 433
Episodes Sampled: 460
Episodes Sampled: 487
Episodes Sampled: 515
Episodes Sampled: 541
Episodes Sampled: 569
Episodes Sampled: 597
Episodes Sampled: 624
Episodes Sampled: 653
Episodes Sampled: 678
Episodes Sampled: 703
Episodes Sampled: 726
Episodes Sampled: 751
Episodes Sampled: 775
Episodes Sampled: 799
Episodes Sampled: 823
Episodes Sampled: 847
Episodes Sampled: 871
Episodes Sampled: 895
Episodes Sampled: 919
Episodes Sampled: 940
Episodes Sampled: 964


  logger.warn(
  logger.warn(
  logger.warn(


Episodes Sampled: 986


  logger.warn(
  logger.warn(
  logger.warn(
  logger.warn(


Episodes Sampled: 999


  logger.warn(


Episodes Sampled: 1000
Episodes Sampled: 1000
Sampling Finished
47.15958046913147


In [6]:
# The output file name encodes the episode count and the quantization model
# used for this run.
trajectory_file = f"trajectory_ep{episodes}_{quant_model}.npy"
save_path = os.path.join("tmp", "offline_rl_trajectories", trajectory_file)

# Left as the final expression so the notebook displays the call's return value.
parallel_sampler.create_merged_dataset(save_path=save_path)

tmp/offline_rl_trajectories/trajectory_ep1000_model_h_c-20bits3.npy


{'obs': array([6.86331331e+11, 6.86331331e+11, 6.77741396e+11, ...,
        6.40135250e+10, 6.40135250e+10, 1.32733067e+11]),
 'actions': array([17, 16, 20, ...,  8, 24, 17]),
 'rewards': array([-197.52677917, -168.32380676, -177.52828979, ..., -515.60119629,
        -452.42816162, -392.27337646]),
 'dones': array([False, False, False, ..., False, False,  True]),
 'new_obs': array([6.86331331e+11, 6.77741396e+11, 6.86331331e+11, ...,
        6.40135250e+10, 1.32733067e+11, 1.37028035e+11]),
 'unroll_id': array([  0,   0,   0, ..., 124, 124, 124])}