## Define ORCA

In [1]:
import os

import numpy as np

from crowd_sim.envs.policy.orca import ORCA
from crowd_sim.envs.utils.state import JointState

class Suicide(object):
    """Stateless marker object.

    ``ORCAPolicy.predict`` returns an instance of this class in place of an
    action array when ``suicide_if_stuck`` is enabled and the simulator's
    proposed speed falls below the stuck threshold. How the consumer of the
    policy reacts to it is not visible here (presumably it ends the episode --
    TODO confirm against the environment).
    """
    
    
class ORCAPolicy(object):
    """Expert policy that asks an ``ORCA`` simulator for the robot's next action.

    Parameters
    ----------
    suicide_if_stuck : bool, optional
        When True, ``predict`` returns a ``Suicide`` instance instead of an
        action whenever the speed proposed by ORCA is below
        ``stuck_speed_threshold``. Defaults to False.
    stuck_speed_threshold : float, optional
        Speed (``action.v``) under which the robot is treated as stuck.
        Defaults to 0.1, the value that used to be hard-coded.
    rotation_noise : float, optional
        Width of the zero-centred uniform interval the third action component
        is drawn from. Defaults to 0.1, i.e. values in [-0.05, 0.05), the
        range that used to be hard-coded.
    """

    def __init__(self, suicide_if_stuck=False, stuck_speed_threshold=0.1,
                 rotation_noise=0.1):
        self.simulator = ORCA()
        self.suicide_if_stuck = suicide_if_stuck
        self.stuck_speed_threshold = stuck_speed_threshold
        self.rotation_noise = rotation_noise

    def reset(self):
        """Reset the wrapped ORCA simulator."""
        self.simulator.reset()

    def predict(self, obs, env):
        """Compute the expert action for the current state of ``env``.

        Parameters
        ----------
        obs
            Ignored. The state is read directly from ``env.soadrl_sim``.
        env
            Environment exposing ``_get_dt()`` and a ``soadrl_sim`` attribute
            with ``humans``, ``other_robots``, ``robot`` and
            ``obstacle_vertices``.

        Returns
        -------
        numpy.ndarray or Suicide
            A length-3 array ``[vx, vy, w]`` where ``vx``/``vy`` come from
            ORCA's ``(v, r)`` output and ``w`` is a small random value
            (presumably a rotation command -- TODO confirm), or a ``Suicide``
            instance when the robot is considered stuck.
        """
        # Keep ORCA's integration step in sync with the environment's.
        self.simulator.time_step = env._get_dt()
        sim = env.soadrl_sim
        other_agent_states = [
            agent.get_observable_state()
            for agent in sim.humans + sim.other_robots
        ]
        action = self.simulator.predict(
            JointState(sim.robot.get_full_state(), other_agent_states),
            sim.obstacle_vertices,
            sim.robot,
        )
        if self.suicide_if_stuck and action.v < self.stuck_speed_threshold:
            return Suicide()
        # Convert the (magnitude, angle) pair into Cartesian components.
        vx = action.v * np.cos(action.r)
        vy = action.v * np.sin(action.r)
        jitter = self.rotation_noise * (np.random.random() - 0.5)
        return np.array([vx, vy, jitter])

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


## Setup Env and Dummy Expert

In [10]:
from navrep.envs.navreptrainenv import NavRepTrainEnv
# Environment the ORCA expert is queried against.
env = NavRepTrainEnv(
    silent=True,
    scenario='train',
    adaptive=False,
    collect_statistics=False,
)
# Override the simulator's `human_num` setting
# (presumably the number of simulated humans -- TODO confirm).
env.soadrl_sim.human_num = 20

# Expert policy; with `suicide_if_stuck=True` it may return a `Suicide`
# marker instead of an action array.
policy = ORCAPolicy(suicide_if_stuck=True)
def policy_wrapper(_obs):
    """Adapt ``policy.predict`` to a single-argument ``obs -> action`` callable.

    The module-level ``policy`` and ``env`` are looked up each time the
    function is called, so it always uses whatever objects those names are
    bound to at that moment.
    """
    expert_action = policy.predict(_obs, env)
    return expert_action

## Generate Data

In [2]:
from stable_baselines import DQN
from stable_baselines.gail import generate_expert_traj
from crowd_sim.envs.policy.orca import ORCA

In [11]:
# Reference CartPole example (kept for comparison, not executed). It would
# produce the `expert_cartpole.npz` archive that the training cell loads:
#   model = DQN('MlpPolicy', 'CartPole-v1', verbose=1)
#   # Train a DQN agent for 1e5 timesteps and generate 10 trajectories;
#   # data will be saved in a numpy archive named `expert_cartpole.npz`.
#   generate_expert_traj(model, 'expert_cartpole', n_timesteps=int(1e5), n_episodes=10)

# Record 10 episodes of the ORCA expert acting in `env`, using the save
# path 'orca_1'.
# NOTE(review): the recorded output of this cell is
# "AssertionError: Observation space type not supported", so no archive was
# produced by this run. `env`'s observation space is not visible here --
# presumably it is a type `generate_expert_traj` does not accept; TODO confirm.
# NOTE(review): `policy` is built with `suicide_if_stuck=True`, so
# `policy_wrapper` can return a `Suicide` object rather than an array --
# verify that the trajectory recorder can cope with that.
generate_expert_traj(policy_wrapper, 'orca_1', env, n_episodes=10)

AssertionError: Observation space type not supported

## Train Model

In [4]:
from stable_baselines import PPO2
from stable_baselines.gail import ExpertDataset
# Load the recorded expert data used for pretraining.
# Only one expert trajectory is used here; pass `traj_limitation=-1` to use
# the whole dataset.
dataset = ExpertDataset(expert_path='expert_cartpole.npz',
                        traj_limitation=1, batch_size=128)

model = PPO2('MlpPolicy', 'CartPole-v1', verbose=1)
# Pretrain the PPO2 model on the expert dataset (behaviour cloning).
model.pretrain(dataset, n_epochs=1000)

# As an option, you can train the RL agent
# model.learn(int(1e5))

# Test the pre-trained model.
# The evaluation environment gets its own name: `policy_wrapper` reads the
# global `env` when it is called, so rebinding `env` here would hand the ORCA
# expert this environment instead of the NavRepTrainEnv it was set up with.
eval_env = model.get_env()
try:
    obs = eval_env.reset()

    reward_sum = 0.0
    for _ in range(1000):
        action, _ = model.predict(obs)
        obs, reward, done, _ = eval_env.step(action)
        reward_sum += reward
        eval_env.render()
        if done:
            # Episode finished: report its return and start a new one.
            print(reward_sum)
            reward_sum = 0.0
            obs = eval_env.reset()
finally:
    # Release the environment even if stepping or rendering fails.
    eval_env.close()

actions (5000, 1)
obs (5000, 4)
rewards (5000,)
episode_returns (10,)
episode_starts (5000,)
Total trajectories: 1
Total transitions: 998
Average returns: 500.0
Std for returns: 0.0
Creating environment from the given name, wrapped in a DummyVecEnv.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Pretraining with Behavior Cloning...
==== Training progress 10.00% ====
Epoch 100
Training loss: 0.591516, Validation loss: 0.577229

==== Training progress 20.00% ====
Epoch 200
Training loss: 0.566989, Validation loss: 0.551381

==== Training progress 30.00% ====
Epoch 300
Training loss: 0.528652, Validation loss: 0.519976

==== Training progress 40.00% ====
Epoch 400
Training loss: 0.500740, Validation loss: 0.481484

==== Training progress 50.00% ====
Epoch 500
Training loss: 0.462543, Validation loss: 0.430930

==== Training progress 60.00% ====
Epoch 600
Training loss: 0.423922, Validation loss: 0.422883

==== Training progress 70.00% ====
Epoch 700
Training

In [5]:
# NavRep dataset archive inspected below. The location is resolved relative to
# the current user's home directory instead of a fixed /home/<user> prefix, so
# the notebook is not tied to one machine account.
path = os.path.expanduser(
    "~/navrep/datasets/V/navreptrain/099_scans_robotstates_actions_rewards_dones.npz"
)

In [6]:
import numpy as np

In [7]:
# Print the names of the arrays stored in the expert archive.
# The archive is opened as a context manager so its underlying file handle is
# closed as soon as the keys have been listed; `tmp` stays bound afterwards but
# refers to a closed archive.
with np.load("expert_cartpole.npz") as tmp:
    for k in tmp.keys():
        print(k)

actions
obs
rewards
episode_returns
episode_starts


In [8]:
# Print the names of the arrays stored in the archive at `path`.
# The archive is opened as a context manager so its underlying file handle is
# closed as soon as the keys have been listed; `tmp` stays bound afterwards but
# refers to a closed archive.
with np.load(path) as tmp:
    for k in tmp.keys():
        print(k)

scans
robotstates
actions
rewards
dones
