# Demonstration of MCE IRL code & environments

This notebook covers only tabular environments and vanilla MCE IRL.

In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import copy

import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import scipy
import torch as th

import imitation.algorithms.tabular_irl as tirl
import imitation.envs.examples.model_envs as menv

# Use seaborn's "notebook" plotting context for every figure in this notebook.
sns.set(context="notebook")

# Seed every RNG the notebook relies on so that Restart & Run All reproduces
# the same numbers. NumPy's global seed does not touch PyTorch's generator, and
# the reward models below are trained through `th.optim.Adam(rmodel.parameters())`,
# so their weight initialisation (presumably random -- confirm in tabular_irl)
# needs its own seed.
# The torch call goes first: it returns a Generator object, and leaving it as the
# last expression of the cell would print that object as cell output.
th.manual_seed(42)
np.random.seed(42)

# IRL on a random MDP

Testing both linear reward models & MLP reward models.

In [None]:
# Construct the random tabular MDP used for the first set of experiments.
mdp = menv.RandomMDP(
    n_states=16, n_actions=3, branch_factor=2, horizon=10,
    random_obs=True, obs_dim=5, generator_seed=42,
)

# Quantities returned by the MCE helpers for this MDP; `pi` is fed straight
# into the occupancy-measure computation.
V, Q, pi = tirl.mce_partition_fh(mdp)
Dt, D = tirl.mce_occupancy_measures(mdp, pi=pi)

# Project the occupancy measure D through the observation matrix.
demo_counts = D @ mdp.observation_matrix

# Single-element unpacking: raises if `demo_counts` is not one-dimensional.
(obs_dim,) = demo_counts.shape

In [None]:
# Linear reward model on the random MDP, optimised with Adam and passed to
# `mce_irl` together with the demonstrator occupancy measure D.
rmodel = tirl.LinearRewardModel(obs_dim)
opt = th.optim.Adam(params=rmodel.parameters(), lr=0.1)
D_fake = tirl.mce_irl(
    mdp,
    opt,
    rmodel,
    D,
    linf_eps=1e-1,
)

In [None]:
# Same experiment with an MLP reward model (hidden sizes [32, 32]) and a
# tighter `linf_eps` than the linear run.
rmodel = tirl.MLPRewardModel(obs_dim, [32, 32])
opt = th.optim.Adam(params=rmodel.parameters(), lr=0.1)
D_fake = tirl.mce_irl(
    mdp,
    opt,
    rmodel,
    D,
    linf_eps=1e-2,
)

# Same thing, but on grid world

The true reward here is not linear in the reduced feature space (i.e., $(x,y)$ coordinates). Finding an appropriate linear reward is impossible (as I will demonstrate), but an MLP should Just Work(tm).

In [None]:
# Same experiments, but on grid world: a 7x4 CliffWorld with horizon 8,
# built with use_xy_obs=True.
mdp = menv.CliffWorld(width=7, height=4, horizon=8, use_xy_obs=True)

# Demonstrator quantities for this MDP.
V, Q, pi = tirl.mce_partition_fh(mdp)
Dt, D = tirl.mce_occupancy_measures(mdp, pi=pi)
demo_counts = D @ mdp.observation_matrix
(obs_dim,) = demo_counts.shape  # raises unless demo_counts is 1-D

# Linear reward model passed to MCE IRL along with D.
rmodel = tirl.LinearRewardModel(obs_dim)
opt = th.optim.Adam(params=rmodel.parameters(), lr=1.0)
D_fake = tirl.mce_irl(mdp, opt, rmodel, D, linf_eps=0.1)

# Figure 1: the demonstrator occupancy measure D.
mdp.draw_value_vec(D)
plt.title("Cliff World $p(s)$")
plt.xlabel("x-coord")
plt.ylabel("y-coord")
plt.show()

# Figure 2: the occupancy measure returned by mce_irl for the linear model.
mdp.draw_value_vec(D_fake)
plt.title("Occupancy for linear reward function")
plt.show()

# Figure 3: model output on the observation matrix (left) next to the MDP's
# reward matrix (right).
plt.subplot(1, 2, 1)
linear_reward_vec = rmodel(th.as_tensor(mdp.observation_matrix)).detach().numpy()
mdp.draw_value_vec(linear_reward_vec)
plt.title("Inferred reward")

plt.subplot(1, 2, 2)
mdp.draw_value_vec(mdp.reward_matrix)
plt.title("True reward")
plt.show()

In [None]:
# MLP reward model (one hidden layer of 1024 units, ReLU activation) on the
# same CliffWorld `mdp` and occupancy measure `D` defined in the cell above.
rmodel = tirl.MLPRewardModel(obs_dim, [1024], activation=th.nn.ReLU)
opt = th.optim.Adam(params=rmodel.parameters(), lr=1e-3)
D_fake_mlp = tirl.mce_irl(
    mdp,
    opt,
    rmodel,
    D,
    linf_eps=3e-2,
    print_interval=250,
)

# Occupancy measure returned by mce_irl for the MLP model.
mdp.draw_value_vec(D_fake_mlp)
plt.title("Occupancy for MLP reward function")
plt.show()

# Model output on the observation matrix (left) next to the MDP's reward
# matrix (right).
plt.subplot(1, 2, 1)
mlp_reward_vec = rmodel(th.as_tensor(mdp.observation_matrix)).detach().numpy()
mdp.draw_value_vec(mlp_reward_vec)
plt.title("Inferred reward")

plt.subplot(1, 2, 2)
mdp.draw_value_vec(mdp.reward_matrix)
plt.title("True reward")
plt.show()

Notice that the inferred reward is absolutely nothing like the true reward, but the occupancy measure still (roughly) matches the true occupancy measure.