# Learning a Reward Function using Preference Comparisons on Atari

In this example, we use a convolutional neural network for both the policy and the reward model. We also shape the learned reward model with the policy's learned value function, to demonstrate that this is possible.

First, we will set up the environment, reward network, et cetera.

In [None]:
import gym
from gym.wrappers.atari_preprocessing import AtariPreprocessing
from gym.wrappers.frame_stack import FrameStack
import torch as th

from stable_baselines3 import PPO
from stable_baselines3.common.vec_env import DummyVecEnv
from stable_baselines3.ppo import CnnPolicy

from imitation.algorithms import preference_comparisons
from imitation.policies.base import NormalizeFeaturesExtractor
from imitation.rewards.reward_nets import CnnRewardNet, ChannelFirstRewardWrapper
from imitation.util.networks import RunningNorm

# Run on the GPU when PyTorch reports that CUDA is available, otherwise on the CPU.
if th.cuda.is_available():
    device = th.device("cuda")
else:
    device = th.device("cpu")

def _make_asteroids_env():
    """Create one Asteroids env wrapped in AtariPreprocessing and a 4-frame FrameStack."""
    raw_env = gym.make("AsteroidsNoFrameskip-v4")
    return FrameStack(AtariPreprocessing(raw_env), 4)


# A vectorized env holding a single copy of the environment built above.
venv = DummyVecEnv([_make_asteroids_env])

# CNN reward model built from the venv's observation and action spaces, with
# RunningNorm passed as its input-normalization layer.
base_reward_net = CnnRewardNet(
    venv.observation_space,
    venv.action_space,
    normalize_input_layer=RunningNorm,
)
# Place the reward model's parameters on the selected device.
base_reward_net = base_reward_net.to(device)

# NOTE(review): presumably this wrapper adapts the channel-first observations
# produced by the venv for the CNN reward net -- confirm against imitation docs.
reward_net = ChannelFirstRewardWrapper(base_reward_net)

# Components of the preference-comparison pipeline, all seeded with 0 where a
# seed is accepted so that runs are repeatable.
fragmenter = preference_comparisons.RandomFragmenter(
    warning_threshold=0,
    seed=0,
)
gatherer = preference_comparisons.SyntheticGatherer(seed=0)

# The reward model is fit with a cross-entropy loss for 3 epochs per training call.
preference_loss = preference_comparisons.CrossEntropyRewardLoss()
reward_trainer = preference_comparisons.BasicRewardTrainer(
    model=reward_net,
    loss=preference_loss,
    epochs=3,
)

# Use imitation's NormalizeFeaturesExtractor, configured with RunningNorm, as
# the feature extractor of the CNN policy.
agent_policy_kwargs = {
    "features_extractor_class": NormalizeFeaturesExtractor,
    "features_extractor_kwargs": {"normalize_class": RunningNorm},
}

# n_steps is counted per environment, so divide a 2048-step rollout across the
# environments held by the venv.
steps_per_env = 2048 // venv.num_envs

# PPO agent that is trained against the learned reward during preference comparisons.
agent = PPO(
    policy=CnnPolicy,
    env=venv,
    policy_kwargs=agent_policy_kwargs,
    n_steps=steps_per_env,
    batch_size=64,
    n_epochs=10,
    learning_rate=3e-4,
    ent_coef=0.0,
    seed=0,
)

# Couples the PPO agent with the learned reward network; exploration_frac=0.0
# is passed, so no fraction of the rollouts is requested for exploration.
trajectory_generator = preference_comparisons.AgentTrainer(
    seed=0,
    exploration_frac=0.0,
    reward_fn=reward_net,
    algorithm=agent,
)

# Assemble the full preference-comparison algorithm from the pieces built above.
pref_comparisons = preference_comparisons.PreferenceComparisons(
    trajectory_generator,
    reward_net,
    # Pipeline components.
    fragmenter=fragmenter,
    preference_gatherer=gatherer,
    reward_trainer=reward_trainer,
    # Schedule: 2 iterations, fragments of 50 steps, 10% of comparisons up front.
    num_iterations=2,
    fragment_length=50,
    initial_comparison_frac=0.1,
    initial_epoch_multiplier=1,
    transition_oversampling=1,
    # NOTE(review): presumably needed because Atari episodes differ in length -- confirm.
    allow_variable_horizon=True,
    seed=0,
)

We are now ready to train the reward model.

In [None]:
# Run the preference-comparison loop with a budget of 1,000 timesteps and 30
# comparisons. Kept as the last expression of the cell so that the value
# returned by train() is still displayed.
pref_comparisons.train(total_timesteps=1_000, total_comparisons=30)

We can now wrap the environment with the learned reward model, shaped by the policy's learned value function.

In [None]:
from imitation.rewards.reward_nets import ShapedRewardNet
from imitation.rewards.reward_wrapper import RewardVecEnvWrapper

# Shape the learned base reward, using the trained agent's value predictions as
# the potential function and a discount factor of 0.99.
value_shaped_net = ShapedRewardNet(
    base=base_reward_net,
    potential=agent.policy.predict_values,
    discount_factor=0.99,
)
shaped_reward_net = ChannelFirstRewardWrapper(value_shaped_net)

# Wrap the venv so that rewards come from the shaped network's predict method.
learned_reward_venv = RewardVecEnvWrapper(venv, shaped_reward_net.predict)

Next, we train an agent that sees only the shaped, learned reward.

In [None]:
# A fresh PPO learner whose environment is the learned-reward wrapper.
learner = PPO(
    policy=CnnPolicy,
    env=learned_reward_venv,
    n_steps=64,
    batch_size=64,
    n_epochs=10,
    learning_rate=3e-4,
    ent_coef=0.0,
    seed=0,
)
# Train for 1000 timesteps; kept as the cell's final expression so that the
# value returned by learn() is still displayed.
learner.learn(total_timesteps=1000)

We now evaluate the learner using the original reward.

In [None]:
from stable_baselines3.common.evaluation import evaluate_policy

# Evaluate for 10 episodes on the unwrapped venv rather than the learned-reward
# wrapper. evaluate_policy returns a (mean, std) pair; only the mean is kept.
reward, _ = evaluate_policy(model=learner.policy, env=venv, n_eval_episodes=10)
print(reward)