## Define ORCA

In [1]:
from crowd_sim.envs.policy.orca import ORCA
from crowd_sim.envs.utils.state import JointState
import numpy as np

    
class ORCAPolicy(object):
    """Expert policy that delegates to the crowd_sim ``ORCA`` planner.

    ``predict`` returns a Cartesian velocity ``[vx, vy]`` computed from the
    planner's action (its ``v`` and ``r`` attributes).
    """

    def __init__(self, suicide_if_stuck=False):
        """Create the underlying ORCA planner.

        :param suicide_if_stuck: stored on the instance; no method of this
            class reads it.
        """
        self.simulator = ORCA()
        self.suicide_if_stuck = suicide_if_stuck

    def reset(self):
        """Reset the wrapped ORCA planner."""
        self.simulator.reset()

    def predict(self, obs, env):
        """Query ORCA for the robot's next action and return it as ``[vx, vy]``.

        :param obs: ignored; the planner input is read from ``env.soadrl_sim``.
        :param env: environment providing ``_get_dt()`` and a ``soadrl_sim``
            with ``humans``, ``other_robots``, ``robot`` and ``obstacle_vertices``.
        :return: (np.ndarray) array ``[vx, vy]``.
        """
        # Keep the planner's time step in sync with the environment's.
        self.simulator.time_step = env._get_dt()

        sim = env.soadrl_sim
        neighbour_states = []
        for agent in sim.humans + sim.other_robots:
            neighbour_states.append(agent.get_observable_state())

        joint_state = JointState(sim.robot.get_full_state(), neighbour_states)
        action = self.simulator.predict(joint_state, sim.obstacle_vertices, sim.robot)

        # (v, r) is treated as polar (magnitude, angle) and converted to x/y components.
        speed, heading = action.v, action.r
        return np.array([speed * np.cos(heading), speed * np.sin(heading)])

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [17]:
# Sanity-check the ORCA expert: roll it out in the NavRep env for 1000 steps
# and print the return of every finished episode.
from navrep.envs.e2eenv import E2E1DNavRepEnv

# Create the expert here so the cell runs on a fresh kernel instead of relying
# on a `policy` object left over from a cell further down the notebook.
policy = ORCAPolicy(suicide_if_stuck=True)


def policy_wrapper(_obs):
    """Adapt the expert to a one-argument callable; `env` is looked up at call time."""
    return policy.predict(_obs, env)


env = E2E1DNavRepEnv(silent=True, scenario='train', adaptive=False, collect_statistics=False)
env.soadrl_sim.human_num = 2
obs = env.reset()
model = policy_wrapper

reward_sum = 0.0
for _ in range(1000):
    action = model(obs)
    obs, reward, done, _ = env.step(action)
    reward_sum += reward
    env.render()
    if done:
        # Episode finished: report its return and start a new one.
        print(reward_sum)
        reward_sum = 0.0
        obs = env.reset()

env.close()

-32.52762800052541
0.2532474052331336
0.28262843141272137
-26.688682994243393
-26.756278006305692
0.6327754630259563
-28.732229134588874
-11.869005612283932
-27.776005176029052
0.1956912933290006


## Setup Env and Dummy Expert

In [2]:
from navrep.envs.e2eenv import E2E1DNavRepEnv

# Environment the expert is rolled out in.
env = E2E1DNavRepEnv(
    silent=True,
    scenario='train',
    adaptive=False,
    collect_statistics=False,
)
env.soadrl_sim.human_num = 2

# ORCA expert whose actions are recorded as demonstrations.
policy = ORCAPolicy(suicide_if_stuck=True)


def policy_wrapper(_obs):
    """One-argument adapter around ``policy.predict``.

    ``env`` is the notebook-level variable and is resolved each time the
    wrapper is called.
    """
    return policy.predict(_obs, env)

Ros was not found, disabled.


## Generate Data

In [3]:
from stable_baselines import DQN
#from stable_baselines.gail import generate_expert_traj
from crowd_sim.envs.policy.orca import ORCA

In [4]:
import os
import warnings
from typing import Dict

import cv2  # pytype:disable=import-error
import numpy as np
from gym import spaces

from stable_baselines.common.base_class import BaseRLModel
from stable_baselines.common.vec_env import VecEnv, VecFrameStack
from stable_baselines.common.base_class import _UnvecWrapper


def generate_expert_traj(model, save_path=None, env=None, n_timesteps=0,
                         n_episodes=100, image_folder='recorded_images'):
    """
    Train expert controller (if needed) and record expert trajectories.

    .. note::

        only Box and Discrete spaces are supported for now.

    .. note::

        ``env.close()`` is called before returning, so the environment passed in
        is closed once the trajectories have been recorded.

    :param model: (RL model or callable) The expert model, if it needs to be trained,
        then you need to pass ``n_timesteps > 0``. A callable is invoked as ``model(obs)``
        and must return the action.
    :param save_path: (str) Path without the extension where the expert dataset will be saved
        (ex: 'expert_cartpole' -> creates 'expert_cartpole.npz').
        If not specified, it will not save, and just return the generated expert trajectories.
        This parameter must be specified for image-based environments.
    :param env: (gym.Env) The environment, if not defined then it tries to use the model
        environment.
    :param n_timesteps: (int) Number of training timesteps
    :param n_episodes: (int) Number of trajectories (episodes) to record
    :param image_folder: (str) When using images, folder that will be used to record images.
    :return: (dict) the generated expert trajectories, with keys 'actions', 'obs',
        'rewards', 'episode_returns' and 'episode_starts'. 'episode_starts' is a 1-D
        boolean array for both plain and vectorized environments.
    """

    # Retrieve the environment using the RL model
    if env is None and isinstance(model, BaseRLModel):
        env = model.get_env()

    assert env is not None, "You must set the env in the model or pass it to the function."

    is_vec_env = False
    if isinstance(env, VecEnv) and not isinstance(env, _UnvecWrapper):
        is_vec_env = True
        if env.num_envs > 1:
            warnings.warn("You are using multiple envs, only the data from the first one will be recorded.")

    # Sanity check
    assert (isinstance(env.observation_space, spaces.Box) or
            isinstance(env.observation_space, spaces.Discrete)), "Observation space type not supported"

    assert (isinstance(env.action_space, spaces.Box) or
            isinstance(env.action_space, spaces.Discrete)), "Action space type not supported"

    # Check if we need to record images
    obs_space = env.observation_space
    record_images = len(obs_space.shape) == 3 and obs_space.shape[-1] in [1, 3, 4] \
                    and obs_space.dtype == np.uint8
    if record_images and save_path is None:
        warnings.warn("Observations are images but no save path was specified, so will save in numpy archive; "
                      "this can lead to higher memory usage.")
        record_images = False
    elif not record_images and len(obs_space.shape) == 3 and obs_space.dtype == np.uint8:
        # `elif`: the "number of channel > 4" message must not also fire when image
        # recording was only disabled above because no save path was given.
        warnings.warn("The observations looks like images (shape = {}) "
                      "but the number of channel > 4, so it will be saved in the numpy archive "
                      "which can lead to high memory usage".format(obs_space.shape))

    image_ext = 'jpg'
    if record_images:
        # We save images as jpg or png, that have only 3/4 color channels
        if isinstance(env, VecFrameStack) and env.n_stack == 4:
            # assert env.n_stack < 5, "The current data recorder does no support"\
            #                          "VecFrameStack with n_stack > 4"
            image_ext = 'png'

        folder_path = os.path.dirname(save_path)
        image_folder = os.path.join(folder_path, image_folder)
        os.makedirs(image_folder, exist_ok=True)
        print("=" * 10)
        print("Images will be recorded to {}/".format(image_folder))
        print("Image shape: {}".format(obs_space.shape))
        print("=" * 10)

    if n_timesteps > 0 and isinstance(model, BaseRLModel):
        model.learn(n_timesteps)

    actions = []
    observations = []
    rewards = []
    episode_returns = np.zeros((n_episodes,))
    episode_starts = []

    ep_idx = 0
    obs = env.reset()
    episode_starts.append(True)
    reward_sum = 0.0
    idx = 0
    # state and mask for recurrent policies
    state, mask = None, None

    if is_vec_env:
        mask = [True for _ in range(env.num_envs)]

    while ep_idx < n_episodes:
        # The observation is stored *before* stepping, so obs[i] pairs with actions[i].
        obs_ = obs[0] if is_vec_env else obs
        if record_images:
            image_path = os.path.join(image_folder, "{}.{}".format(idx, image_ext))
            # Convert from RGB to BGR
            # which is the format OpenCV expect
            if obs_.shape[-1] == 3:
                obs_ = cv2.cvtColor(obs_, cv2.COLOR_RGB2BGR)
            cv2.imwrite(image_path, obs_)
            observations.append(image_path)
        else:
            observations.append(obs_)

        if isinstance(model, BaseRLModel):
            action, state = model.predict(obs, state=state, mask=mask)
        else:
            action = model(obs)

        obs, reward, done, _ = env.step(action)

        # Use only first env
        if is_vec_env:
            mask = [done[0] for _ in range(env.num_envs)]
            action = np.array([action[0]])
            reward = np.array([reward[0]])
            done = np.array([done[0]])

        actions.append(action)
        rewards.append(reward)
        # Store a plain bool: the list starts with the scalar ``True``, so appending
        # the shape-(1,) ``done`` array of a vectorized env would make it ragged.
        episode_starts.append(bool(done))
        # Accumulate a scalar so the episode return is never a 1-element array.
        reward_sum += reward[0] if is_vec_env else reward
        idx += 1
        if done:
            if not is_vec_env:
                # Only plain envs are reset here; the vectorized branch relies on
                # `obs` returned by `step` and on `mask` instead.
                obs = env.reset()
                # Reset the state in case of a recurrent policy
                state = None

            episode_returns[ep_idx] = reward_sum
            reward_sum = 0.0
            ep_idx += 1

    if isinstance(env.observation_space, spaces.Box) and not record_images:
        observations = np.concatenate(observations).reshape((-1,) + env.observation_space.shape)
    elif isinstance(env.observation_space, spaces.Discrete):
        observations = np.array(observations).reshape((-1, 1))
    elif record_images:
        # Image observations are stored as the paths of the written files.
        observations = np.array(observations)

    if isinstance(env.action_space, spaces.Box):
        actions = np.concatenate(actions).reshape((-1,) + env.action_space.shape)
    elif isinstance(env.action_space, spaces.Discrete):
        actions = np.array(actions).reshape((-1, 1))

    rewards = np.array(rewards)
    # Drop the flag appended after the final step: it belongs to an episode that
    # was never recorded.
    episode_starts = np.array(episode_starts[:-1])

    assert len(observations) == len(actions)

    numpy_dict = {
        'actions': actions,
        'obs': observations,
        'rewards': rewards,
        'episode_returns': episode_returns,
        'episode_starts': episode_starts
    }  # type: Dict[str, np.ndarray]

    for key, val in numpy_dict.items():
        print(key, val.shape)

    if save_path is not None:
        np.savez(save_path, **numpy_dict)

    env.close()

    return numpy_dict

In [5]:
# Record 1000 episodes of the ORCA expert; the dataset is saved as `orca_1.npz`.
# NOTE: generate_expert_traj calls env.close() before it returns.
#
# Library example kept for reference (train a DQN agent for 1e5 timesteps and
# generate 10 trajectories, saved in a numpy archive named `expert_cartpole.npz`):
#   model = DQN('MlpPolicy', 'CartPole-v1', verbose=1)
#   generate_expert_traj(model, 'expert_cartpole', n_timesteps=int(1e5), n_episodes=10)
generate_expert_traj(policy_wrapper, save_path='orca_1', env=env, n_episodes=1000)

actions (113362, 2)
obs (113362, 1085)
rewards (113362,)
episode_returns (1000,)
episode_starts (113362,)


{'actions': array([[ 0.17292904, -0.0522805 ],
        [ 0.24484049, -0.11186286],
        [ 0.38693309, -0.13376042],
        ...,
        [-0.20220875, -0.36130958],
        [-0.19412081, -0.34685699],
        [-0.18635637, -0.3329825 ]]),
 'obs': array([[ 3.20571566,  3.19619703,  3.18684173, ...,  0.        ,
          0.        ,  0.        ],
        [ 3.17651463,  3.16708255,  3.15781236, ...,  0.17292904,
         -0.0522805 ,  0.        ],
        [ 3.13906765,  3.12974691,  3.12058592, ...,  0.24484049,
         -0.11186286,  0.        ],
        ...,
        [ 4.56903791,  4.58403492,  4.59928703, ..., -0.2106337 ,
         -0.37636444,  0.        ],
        [ 4.56904173,  4.58403873,  4.59929085, ..., -0.20220875,
         -0.36130958,  0.        ],
        [ 4.56904507,  4.58404207,  4.59929419, ..., -0.19412081,
         -0.34685699,  0.        ]]),
 'rewards': array([ 2.50985813e-03,  4.21745056e-03,  5.87401559e-03, ...,
        -9.91719106e-01, -9.92050342e-01, -2.5000

In [14]:
# Reload the saved expert dataset and print only the episode returns above 30.
tmp = np.load('orca_1.npz')
print(tmp['episode_returns'][tmp['episode_returns']>30])

[77.01932085 95.23103121 91.76252551]


In [None]:
# Show the env's observation shape (presumably to compare with the second axis of 'obs' in the dataset).
print(env.observation_space.shape) 

## Generate Trajectory

## Train Model

In [10]:
from stable_baselines import PPO2
from stable_baselines.gail import ExpertDataset
from navrep.tools.custom_policy import Custom1DPolicy, ARCH, _C
# Using only one expert trajectory
# you can specify `traj_limitation=-1` for using the whole dataset
dataset = ExpertDataset(expert_path='orca_1.npz',
                        traj_limitation=1, batch_size=64)

model = PPO2('MlpPolicy', env, verbose=1)
# Pretrain the PPO2 model (behavior cloning on the expert dataset)
model.pretrain(dataset, n_epochs=5000)

# As an option, you can train the RL agent
# model.learn(int(1e5))

# Test the pre-trained model.
# Evaluate on the env held by the model, under its own name: `policy_wrapper`
# reads the notebook-level `env` at call time, so rebinding `env` here would
# silently point the expert at whatever object `get_env()` returns.
eval_env = model.get_env()
obs = eval_env.reset()

reward_sum = 0.0
for _ in range(1000):
    action, _ = model.predict(obs)
    obs, reward, done, _ = eval_env.step(action)
    reward_sum += reward
    eval_env.render()
    if done:
        # Episode finished: report its return and start a new one.
        print(reward_sum)
        reward_sum = 0.0
        obs = eval_env.reset()

eval_env.close()

actions (113362, 2)
obs (113362, 1085)
rewards (113362,)
episode_returns (1000,)
episode_starts (113362,)
Total trajectories: 1
Total transitions: 126
Average returns: -19.149434741689326
Std for returns: 38.877416667286596
Pretraining with Behavior Cloning...
==== Training progress 10.00% ====
Epoch 500
Training loss: 0.000252, Validation loss: 0.008476

==== Training progress 20.00% ====
Epoch 1000
Training loss: 0.000036, Validation loss: 0.009115

==== Training progress 30.00% ====
Epoch 1500
Training loss: 0.000007, Validation loss: 0.008005

==== Training progress 40.00% ====
Epoch 2000
Training loss: 0.000004, Validation loss: 0.008114

==== Training progress 50.00% ====
Epoch 2500
Training loss: 0.000005, Validation loss: 0.008879

==== Training progress 60.00% ====
Epoch 3000
Training loss: 0.000022, Validation loss: 0.008303

==== Training progress 70.00% ====
Epoch 3500
Training loss: 0.000003, Validation loss: 0.006957

==== Training progress 80.00% ====
Epoch 4000
Training

ArgumentError: argument 2: <class 'TypeError'>: wrong type

## Stable Baselines Example

In [None]:
from stable_baselines import DQN
# NOTE: this import rebinds `generate_expert_traj`, replacing the function of
# the same name defined earlier in this notebook.
from stable_baselines.gail import generate_expert_traj

model = DQN('MlpPolicy', 'CartPole-v1', verbose=1)

# Train a DQN agent for 1e5 timesteps and generate 10 trajectories;
# data will be saved in a numpy archive named `expert_cartpole.npz`.
generate_expert_traj(
    model,
    save_path='expert_cartpole',
    n_timesteps=int(1e5),
    n_episodes=10,
)

In [None]:
from stable_baselines import PPO2
from stable_baselines.gail import ExpertDataset

# Load a single expert trajectory
# (pass `traj_limitation=-1` to use the whole dataset).
dataset = ExpertDataset(
    expert_path='expert_cartpole.npz',
    traj_limitation=1,
    batch_size=128,
)

model = PPO2('MlpPolicy', 'CartPole-v1', verbose=1)
# Behavior-cloning pretraining of the PPO2 model
model.pretrain(dataset, n_epochs=1000)

# Optional: continue with regular RL training
# model.learn(int(1e5))

# Roll out the pretrained model for 1000 steps, printing each episode return.
# NOTE: this rebinds the notebook-level `env` to the env held by the model.
env = model.get_env()
obs = env.reset()

reward_sum = 0.0
for _ in range(1000):
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
    reward_sum += reward
    env.render()
    if done:
        print(reward_sum)
        reward_sum = 0.0
        obs = env.reset()

env.close()

In [None]:
# Scratch check: a length-3 array with every entry equal to +inf.
np.ones(3)*np.inf