# Learning a Reward Function using Preference Comparisons on Atari

In this notebook, we will use a convolutional neural network for both our policy and our reward model. We will also shape the learned reward model with the policy's learned value function, to demonstrate how this can be done. In the interests of execution time, we will only do a little bit of training - much less than in the previous preference comparison notebook.

First, we will set up the environment, reward network, et cetera.

In [8]:
import torch as th
import gym
from gym.wrappers import TimeLimit

from seals.util import AutoResetWrapper

from stable_baselines3 import PPO
from stable_baselines3.common.atari_wrappers import AtariWrapper
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.vec_env import DummyVecEnv, VecFrameStack
from stable_baselines3.ppo import CnnPolicy

from imitation.algorithms import preference_comparisons
from imitation.policies.base import NormalizeFeaturesExtractor
from imitation.rewards.reward_nets import CnnRewardNet
from imitation.util.networks import EMANorm, RunningNorm

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if th.cuda.is_available():
    device = th.device("cuda")
else:
    device = th.device("cpu")

def atari_const_len_wrapper(env):
    """Wrap an Atari environment so that its episodes have constant length.

    The environment is reset automatically whenever it reports being done,
    and each episode is cut off once 100 timesteps have elapsed.

    Args:
        env: the raw Atari environment to wrap.

    Returns:
        The environment wrapped, from the inside out, in `AtariWrapper`,
        `AutoResetWrapper` and a 100-step `TimeLimit`.
    """
    preprocessed = AtariWrapper(env)
    # Reset in place when done, so that only the time limit ends an episode.
    auto_resetting = AutoResetWrapper(preprocessed)
    return TimeLimit(auto_resetting, max_episode_steps=100)


# Build the training environment: a single constant-episode-length Asteroids
# env, vectorized with DummyVecEnv and frame-stacked with n_stack=4.
env = gym.make("AsteroidsNoFrameskip-v4")
venv = VecFrameStack(
    DummyVecEnv([lambda: atari_const_len_wrapper(env)]),
    n_stack=4,
)

# CNN reward model built from the vectorized env's observation and action
# spaces, with EMANorm as its input normalization layer.
reward_net = CnnRewardNet(
    venv.observation_space,
    venv.action_space,
    normalize_input_layer=EMANorm,
)
# Move the model's parameters onto the selected device.
reward_net = reward_net.to(device)

# Components of the preference-comparison pipeline. The fragmenter and the
# gatherer are each given seed=0.
#   - fragmenter: selects the trajectory fragments that will be compared
#     (presumably at random, going by the class name -- confirm).
#     NOTE(review): warning_threshold=0 presumably silences the warning about
#     having few transitions to sample from -- confirm.
#   - gatherer: produces the preference labels for each fragment pair
#     (presumably synthetic labels derived from the environment reward --
#     confirm against the imitation docs).
#   - reward_trainer: fits reward_net to the gathered comparisons using a
#     cross-entropy loss, with epochs=3.
fragmenter = preference_comparisons.RandomFragmenter(warning_threshold=0, seed=0)
gatherer = preference_comparisons.SyntheticGatherer(seed=0)
reward_trainer = preference_comparisons.BasicRewardTrainer(
    model=reward_net, loss=preference_comparisons.CrossEntropyRewardLoss(), epochs=3
)

# PPO agent with a CNN policy whose features extractor normalizes its input
# using RunningNorm. The rollout and batch sizes are deliberately tiny to keep
# the notebook fast.
agent = PPO(
    policy=CnnPolicy,
    env=venv,
    policy_kwargs={
        "features_extractor_class": NormalizeFeaturesExtractor,
        "features_extractor_kwargs": {"normalize_class": RunningNorm},
    },
    # Optimization hyperparameters.
    learning_rate=0.0003,
    n_steps=16,
    batch_size=8,
    n_epochs=10,
    ent_coef=0.0,
    seed=0,
)

# Trajectory generator: wraps the PPO agent together with the reward network
# and the vectorized environment.
# NOTE(review): presumably the agent is trained on reward_net's output rather
# than on the environment reward -- confirm against the AgentTrainer docs.
trajectory_generator = preference_comparisons.AgentTrainer(
    algorithm=agent,
    reward_fn=reward_net,
    venv=venv,
    # NOTE(review): 0.0 presumably disables additional exploratory rollouts -- confirm.
    exploration_frac=0.0,
    seed=0,
)

# Tie everything together: the trajectory generator, the reward model and the
# fragmenter / gatherer / reward trainer built above.
pref_comparisons = preference_comparisons.PreferenceComparisons(
    trajectory_generator,
    reward_net,
    num_iterations=2,
    fragmenter=fragmenter,
    preference_gatherer=gatherer,
    reward_trainer=reward_trainer,
    # Each compared fragment is 10 transitions long.
    fragment_length=10,
    transition_oversampling=1,
    # NOTE(review): presumably the fraction of all comparisons gathered before
    # the first agent-training iteration -- confirm.
    initial_comparison_frac=0.1,
    # Episodes are constant-length (see atari_const_len_wrapper), so variable
    # horizons are not allowed here.
    allow_variable_horizon=False,
    seed=0,
    initial_epoch_multiplier=1,
)

We are now ready to train the reward model.

In [9]:
# Deliberately tiny budgets so that the notebook executes quickly. With the
# settings above, the logged output below shows a query schedule of [1, 9, 5]
# and 8 agent timesteps per training round.
pref_comparisons.train(
    total_timesteps=16,
    total_comparisons=15,
)

Query schedule: [1, 9, 5]
Collecting 2 fragments (20 transitions)
Requested 20 transitions but only 0 in buffer. Sampling 20 additional transitions.
Creating fragment pairs
Gathering preferences
Dataset now contains 1 comparisons


Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 8 timesteps
---------------------------------------------------
| raw/                                 |          |
|    agent/rollout/ep_rew_wrapped_mean | 0.0746   |
|    agent/time/fps                    | 122      |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 16       |
---------------------------------------------------
---------------------------------------------------
| mean/                                |          |
|    agent/rollout/ep_rew_wrapped_mean | 0.0746   |
|    agent/time/fps                    | 122      |
|    agent/time/iterations             | 1        |
|    agent/time/time_elapsed           | 0        |
|    agent/time/total_timesteps        | 16       |
|    agent/train/approx_kl             | 0.0107   |
|    agent/train/clip_fraction         | 0.0437   |
|    agent/train/clip_range            | 0.2      |
|    agent/train/entropy_loss    

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 8 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_rew_wrapped_mean | 92.9        |
|    agent/time/fps                    | 49          |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 32          |
|    agent/train/approx_kl             | 0.010703877 |
|    agent/train/clip_fraction         | 0.0437      |
|    agent/train/clip_range            | 0.2         |
|    agent/train/entropy_loss          | -2.64       |
|    agent/train/explained_variance    | 0.0882      |
|    agent/train/learning_rate         | 0.0003      |
|    agent/train/loss                  | 9.94        |
|    agent/train/n_updates             | 10          |
|    agent/train/policy_gradient_loss  | -0.0584     |
|    agent/train/value_loss            | 29          |
----------------------------------

Training reward model:   0%|          | 0/3 [00:00<?, ?it/s]

Training agent for 8 timesteps
------------------------------------------------------
| raw/                                 |             |
|    agent/rollout/ep_rew_wrapped_mean | 71.4        |
|    agent/time/fps                    | 64          |
|    agent/time/iterations             | 1           |
|    agent/time/time_elapsed           | 0           |
|    agent/time/total_timesteps        | 48          |
|    agent/train/approx_kl             | 0.022997785 |
|    agent/train/clip_fraction         | 0.244       |
|    agent/train/clip_range            | 0.2         |
|    agent/train/entropy_loss          | -2.64       |
|    agent/train/explained_variance    | -2.26       |
|    agent/train/learning_rate         | 0.0003      |
|    agent/train/loss                  | -0.123      |
|    agent/train/n_updates             | 20          |
|    agent/train/policy_gradient_loss  | -0.0821     |
|    agent/train/value_loss            | 0.215       |
----------------------------------

{'reward_loss': 0.8624255657196045, 'reward_accuracy': 0.4000000059604645}

We can now wrap the environment with the learned reward model, shaped by the policy's learned value function.

In [10]:
from imitation.rewards.reward_nets import ShapedRewardNet
from imitation.rewards.reward_wrapper import RewardVecEnvWrapper

# Shape the learned reward, using the value predictions of the PPO agent
# trained above as the potential function.
shaped_reward_net = ShapedRewardNet(
    base=reward_net,
    # NOTE(review): confirm that the observations handed to `potential` use the
    # layout (channel order, dtype) that agent.policy.predict_values expects.
    potential=agent.policy.predict_values,
    # NOTE(review): presumably this should equal the agent's own discount
    # factor (PPO's gamma) -- confirm.
    discount_factor=0.99,
)
# Wrap venv, passing the shaped network's predict method as the reward function.
learned_reward_venv = RewardVecEnvWrapper(venv, shaped_reward_net.predict)

Next, we train an agent that sees only the shaped, learned reward.

In [11]:
# A fresh PPO learner trained only on the reward-wrapped environment, for
# 1000 timesteps.
learner = PPO(
    policy=CnnPolicy,
    env=learned_reward_venv,
    learning_rate=0.0003,
    n_steps=64,
    batch_size=64,
    n_epochs=10,
    ent_coef=0.0,
    seed=0,
)
learner.learn(total_timesteps=1000)

<stable_baselines3.ppo.ppo.PPO at 0x7f2c1657b280>

We now evaluate the learner using the original reward.

In [5]:
from stable_baselines3.common.evaluation import evaluate_policy

# Evaluate on `venv` (not `learned_reward_venv`), so the score comes from the
# environment's own reward rather than the learned one. Only the first element
# of the returned pair is kept; the second is discarded.
reward, _ = evaluate_policy(learner.policy, venv, n_eval_episodes=10)
print(reward)

0.5
