# Learning a Reward Function using Preference Comparisons

The preference comparisons algorithm learns a reward function by comparing trajectory segments to each other.

To set up the preference comparisons algorithm, we first need to set up a lot of its internals beforehand:

In [1]:
from imitation.algorithms import preference_comparisons
from imitation.rewards.reward_nets import BasicRewardNet
from imitation.util.networks import RunningNorm
from imitation.policies.base import FeedForward32Policy, NormalizeFeaturesExtractor
import gym
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3 import PPO

venv = DummyVecEnv([lambda: gym.make("seals/CartPole-v0")] * 8)

reward_net = BasicRewardNet(
    venv.observation_space, venv.action_space, normalize_input_layer=RunningNorm
)

fragmenter = preference_comparisons.RandomFragmenter(warning_threshold=0, seed=0)
gatherer = preference_comparisons.SyntheticGatherer(seed=0)
reward_trainer = preference_comparisons.CrossEntropyRewardTrainer(
    model=reward_net,
    epochs=3,
)

agent = PPO(
    policy=FeedForward32Policy,
    policy_kwargs=dict(
        features_extractor_class=NormalizeFeaturesExtractor,
        features_extractor_kwargs=dict(normalize_class=RunningNorm),
    ),
    env=venv,
    seed=0,
    n_steps=2048 // venv.num_envs,
    batch_size=64,
    ent_coef=0.0,
    learning_rate=0.0003,
    n_epochs=10,
)

trajectory_generator = preference_comparisons.AgentTrainer(
    algorithm=agent,
    reward_fn=reward_net,
    exploration_frac=0.0,
    seed=0,
)

pref_comparisons = preference_comparisons.PreferenceComparisons(
    trajectory_generator,
    reward_net,
    fragmenter=fragmenter,
    preference_gatherer=gatherer,
    reward_trainer=reward_trainer,
    comparisons_per_iteration=100,
    fragment_length=100,
    transition_oversampling=1,
    initial_comparison_frac=0.1,
    allow_variable_horizon=False,
    seed=0,
    initial_epoch_multiplier=2,  # Note: set to 200 to achieve sensible results
)

  logger.warn(


Then we can start training the reward model. Note that we need to specify the total timesteps that the agent should be trained and how many fragment comparisons should be made.

In [2]:
pref_comparisons.train(
    total_timesteps=1000,  # Note: set to 40000 to achieve sensible results
    total_comparisons=120,  # Note: set to 4000 to achieve sensible results
)

Collecting 24000 trajectory steps
Requested 24000 transitions but only 0 in buffer. Sampling 24000 additional transitions.
Creating fragment pairs
gathering preferences
Dataset now contains 120 samples
Training reward model


  state_th = th.as_tensor(state, device=self.device)


Training agent for 1000 timesteps
----------------------------------
| raw/                    |      |
|    agent/time/fps       | 208  |
|    agent/time/iterat... | 1    |
|    agent/time/time_e... | 9    |
|    agent/time/total_... | 2048 |
----------------------------------
--------------------------------------
| mean/                   |          |
|    agent/time/fps       | 208      |
|    agent/time/iterat... | 1        |
|    agent/time/time_e... | 9        |
|    agent/time/total_... | 2.05e+03 |
|    agent/train/appro... | 0.0112   |
|    agent/train/clip_... | 0.2      |
|    agent/train/entro... | -0.687   |
|    agent/train/expla... | 0.12     |
|    agent/train/learn... | 0.0003   |
|    agent/train/loss     | 0.0564   |
|    agent/train/n_upd... | 10       |
|    agent/train/polic... | -0.00932 |
|    agent/train/value... | 2.09     |
|    preferences/entropy  | 0.407    |
|    reward/accuracy      | 0.565    |
|    reward/loss          | 0.873    |
-------------------

{'reward_loss': 0.0, 'reward_accuracy': 0.0}

After we trained the reward network using the preference comparisons algorithm, we can wrap our environment with that learned reward.

In [3]:
from imitation.rewards.reward_wrapper import RewardVecEnvWrapper


learned_reward_venv = RewardVecEnvWrapper(venv, reward_net.predict)

Now we can train an agent, that only sees those learned reward.

In [4]:
from stable_baselines3 import PPO
from stable_baselines3.ppo import MlpPolicy

learner = PPO(
    policy=MlpPolicy,
    env=learned_reward_venv,
    seed=0,
    batch_size=64,
    ent_coef=0.0,
    learning_rate=0.0003,
    n_epochs=10,
    n_steps=64,
)
learner.learn(1000)  # Note: set to 100000 to train a proficient expert

<stable_baselines3.ppo.ppo.PPO at 0x7ff9bc36c1f0>

Then we can evaluate it using the original reward.

In [5]:
from stable_baselines3.common.evaluation import evaluate_policy

reward, _ = evaluate_policy(agent.policy, venv, 10)
print(reward)



8.6
