In [1]:
import gymnasium as gym
import numpy as np
import os

import ray._private.utils

from ray.rllib.models.preprocessors import get_preprocessor
from ray.rllib.evaluation.sample_batch_builder import SampleBatchBuilder
from ray.rllib.offline.json_writer import JsonWriter

In [6]:
# Collect one episode of random-policy CartPole experience into an RLlib
# sample-batch builder, in the column layout RLlib's offline readers expect.
batch_builder = SampleBatchBuilder()  # or MultiAgentSampleBatchBuilder

# Write under the per-user temp directory rather than a hard-coded absolute
# home path, so the notebook runs unchanged on any machine.
output_dir = os.path.join(ray._private.utils.get_user_temp_dir(), "demo-out")
writer = JsonWriter(output_dir)

# You normally wouldn't want to manually create sample batches if a
# simulator is available, but let's do it anyway for example purposes:
env = gym.make("CartPole-v1")

# RLlib uses preprocessors to implement transforms such as one-hot encoding
# and flattening of tuple and dict observations. For CartPole a no-op
# preprocessor is used, but this may be relevant for more complex envs.
prep = get_preprocessor(env.observation_space)(env.observation_space)
print("The preprocessor is", prep)

for eps_id in range(1):
    obs, info = env.reset()
    # There is no previous action/reward at t=0, so start from zeros shaped
    # like a sampled action.
    prev_action = np.zeros_like(env.action_space.sample())
    prev_reward = 0
    terminated = truncated = False
    t = 0
    while not terminated and not truncated:
        # Uniform random behaviour policy.
        action = env.action_space.sample()
        new_obs, rew, terminated, truncated, info = env.step(action)
        batch_builder.add_values(
            t=t,
            eps_id=eps_id,
            agent_index=0,
            obs=prep.transform(obs),
            actions=action,
            action_prob=1.0,  # put the true action probability here
            action_logp=0.0,
            rewards=rew,
            prev_actions=prev_action,
            prev_rewards=prev_reward,
            terminateds=terminated,
            truncateds=truncated,
            infos=info,
            new_obs=prep.transform(new_obs),
        )
        # Roll the transition forward: this step's result is the next
        # step's "current"/"previous" values.
        obs = new_obs
        prev_action = action
        prev_reward = rew
        t += 1
    # Persisting is intentionally left disabled here: build_and_reset() resets
    # the builder, and the cells below inspect `batch_builder.buffers`.
    # Uncomment to write the episode to `output_dir`:
    # writer.write(batch_builder.build_and_reset())

The preprocessor is <ray.rllib.models.preprocessors.NoPreprocessor object at 0x7fcb346984c0>


  prep = get_preprocessor(env.observation_space)(env.observation_space)


In [10]:
# Observation recorded for step 1. The collection loop carries `new_obs` over
# into `obs` after each step, so this should equal buffers['new_obs'][0].
batch_builder.buffers['obs'][1]

array([-0.00064289, -0.20507737,  0.0021346 ,  0.25802866], dtype=float32)

In [9]:
# Next-observation recorded for step 0; compare with buffers['obs'][1].
batch_builder.buffers['new_obs'][0]

array([-0.00064289, -0.20507737,  0.0021346 ,  0.25802866], dtype=float32)

In [11]:
# Rebind `env` to a fresh CartPole-v1 instance (same env id as used for the
# collection loop) so its spaces can be inspected below.
env = gym.make("CartPole-v1")

In [12]:
# Display the environment's action space.
env.action_space

Discrete(2)

In [13]:
# Display the environment's observation space (bounds, shape and dtype).
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)