# Estimate Q from trajectories

In [2]:
import numpy as np

from src.algorithm.backward_feature_selection import BackwardFeatureSelector
from src.algorithm.info_theory.entropy import NNEntropyEstimator
from src.wenvs import WrapperEnv
from src.algorithm.utils import episodes_with_len
from src.envs import lqgNdim
from src.policy_eval.fqi import Qfunction



In [4]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# Matrices passed as the Q and R arguments of LQG_nD (presumably the state and
# action cost weights -- confirm against lqgNdim). They are deliberately NOT
# bound to the notebook-level names Q / R: later cells bind `Q` to the fitted
# Q-function, and re-running this cell would otherwise silently replace it
# with a plain ndarray.
Q_mat = np.diag([0.9, 0.9, 0.1, 0.1])
R_mat = Q_mat.copy()  # independent copy, same values as Q_mat

env = lqgNdim.LQG_nD(0.9, n_dim=4, Q=Q_mat, R=R_mat)
# The wrapped environment is what the sampling cells below roll out and seed.
wenv = WrapperEnv(env, continuous_state=True, continuous_actions=True)

In [6]:
# Entropy estimator instance; it is handed to BackwardFeatureSelector in the
# cells below.
est = NNEntropyEstimator()

## Random Policy

In [7]:
# Sampling settings, forwarded to episodes_with_len below.
k, num_ep = 20, 1000

# Reseed both NumPy's global RNG and the wrapped environment so that the
# sampled trajectories are repeatable.
np.random.seed(0)
wenv.seed(0)

# policy=None: presumably the environment's default (random) behaviour, as the
# section title suggests -- confirm against episodes_with_len.
trajectories = episodes_with_len(wenv, num_ep, k, policy=None)

In [8]:
# Build the backward feature selector from the entropy estimator and the
# trajectories sampled above.
fs = BackwardFeatureSelector(est, trajectories)

In [9]:
# Exhaust the try_remove_all iterator up front so the (subset, score) pairs can
# be indexed by later cells, then list them one per line.
selected = [*fs.try_remove_all(k, 0.9)]
for S, err in selected:
    print(S, err)

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

HBox(children=(IntProgress(value=0, max=7), HTML(value='')))

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


{0, 1, 2, 3, 4, 5, 6} 105.79388085803775
{0, 1, 2, 3, 4, 5} 168.19914115888048
{0, 1, 2, 4, 5} 227.44638458072188
{0, 1, 2, 4} 304.99370026897356
{0, 2, 4} 414.97231998550046
{0, 2} 598.6249278526028
{0} 984.0678815526254
set() 990.0377751931628


In [10]:
# Reference fit: fit_fqi is called without features_to_consider, so
# presumably every feature is used -- confirm against Qfunction.fit_fqi.
# 0.9 matches the value passed to try_remove_all and LQG_nD above.
Q = Qfunction(0.9).fit_fqi(trajectories)

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




In [11]:
# Feature indices ordered from highest to lowest importance: argsort is
# ascending, and the [::-1] slice reverses it.
np.argsort(Q.regressor.feature_importances_)[::-1]

array([0, 1, 4, 5, 3, 6, 7, 2])

In [12]:
# Take the entry at index 3 of the elimination results: a (feature subset,
# score) pair as printed by the cell that built `selected`.
S, error = selected[3]
# sorted() instead of list(): S is a set, whose iteration order is not part of
# its contract. S is used both to fit Qhat and to index the columns of mu, so
# a fixed ascending order keeps the feature columns reproducible.
S = sorted(S)

In [13]:
# Second fit on the same trajectories, this time passing only the feature
# indices in S via features_to_consider.
Qhat = Qfunction(0.9).fit_fqi(trajectories, features_to_consider=S)

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




In [14]:
# Slice of the selector's data: all rows, every entry but the last along
# axis 1, index 0 along axis 2.
mu = fs.t_step_data[:, :-1, 0]

# Evaluate the full model on mu and the reduced model on the S columns only.
Q_est, Qhat_est = Q(mu), Qhat(mu[:, S])

# Displayed pair: 2-norm of the difference between the two estimates, next to
# the score attached to S.
(np.linalg.norm(Q_est - Qhat_est, ord=2), error)

(171.21959478636376, 304.99370026897356)

## Optimal Policy

In [15]:
# Same sampling settings as in the random-policy section.
k, num_ep = 20, 1000

# Reset both seeds so this run starts from the same RNG state as the
# random-policy run.
np.random.seed(0)
wenv.seed(0)

# This time the rollouts follow the policy returned by env.optimalPolicy().
trajectories = episodes_with_len(
    wenv,
    num_ep,
    k,
    policy=env.optimalPolicy(),
)

In [16]:
# Rebuild the selector on the trajectories sampled in this section; this
# rebinds `fs`, replacing the selector from the random-policy section.
fs = BackwardFeatureSelector(est, trajectories)

In [17]:
# Collect every (subset, score) pair produced by try_remove_all into a list
# that later cells index, then print the pairs in order.
selected = [*fs.try_remove_all(k, 0.9)]
for S, err in selected:
    print(S, err)

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

HBox(children=(IntProgress(value=0, max=7), HTML(value='')))

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


{0, 1, 2, 3, 4, 5, 7} 12.371901498378747
{0, 1, 2, 3, 4, 5} 18.40528263404554
{0, 1, 2, 3, 4} 30.9793743206337
{0, 1, 2, 3} 40.85389118147725
{0, 1, 3} 99.09944349572892
{0, 1} 180.1017484541161
{0} 380.55570881108224
set() 397.86081495138296


In [18]:
# Reference fit on this section's trajectories; no features_to_consider is
# given, so presumably all features are used -- confirm against fit_fqi.
# Rebinds `Q`, replacing the fit from the random-policy section.
Q = Qfunction(0.9).fit_fqi(trajectories)

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




In [19]:
# Feature indices from highest to lowest importance (ascending argsort,
# reversed by the [::-1] slice).
np.argsort(Q.regressor.feature_importances_)[::-1]

array([0, 5, 4, 1, 3, 2, 7, 6])

In [20]:
# Take the entry at index 5 of the elimination results: a (feature subset,
# score) pair as printed by the cell that built `selected`.
S, error = selected[5]
# sorted() instead of list(): a set has no guaranteed iteration order, and S
# is used both to fit Qhat and to index columns of mu, so fix the order.
S = sorted(S)

In [21]:
# Fit restricted to the feature indices in S (passed as features_to_consider),
# on the same trajectories as the reference fit `Q`.
Qhat = Qfunction(0.9).fit_fqi(trajectories, features_to_consider=S)

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




In [24]:
# All rows of the selector's data, dropping the last entry along axis 1 and
# taking index 0 along axis 2. Later cells reuse `mu`.
mu = fs.t_step_data[:, :-1, 0]

# Full model sees every column of mu; reduced model sees only the S columns.
Q_est, Qhat_est = Q(mu), Qhat(mu[:, S])

# Displayed pair: 2-norm of the estimate difference and the score attached to S.
(np.linalg.norm(Q_est - Qhat_est, ord=2), error)

(0.07491246977705439, 180.1017484541161)

### With handmade S

In [25]:
# Manually chosen feature indices, scored with the same (k, 0.9) arguments
# that were passed to try_remove_all.
S = [0, 1, 4, 5]
error = fs.scoreSubset(k, 0.9, S)

# Refit the reduced Q-function on exactly those features.
Qhat = Qfunction(0.9).fit_fqi(
    trajectories,
    features_to_consider=S,
)

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




In [26]:
# `mu` is not rebuilt here; it is the array bound by the earlier comparison cell.
Q_est, Qhat_est = Q(mu), Qhat(mu[:, S])

# Displayed pair: 2-norm of the difference and the score of the handmade subset.
(np.linalg.norm(Q_est - Qhat_est, ord=2), error)

(0.07618568275108706, 173.23255078942344)

### Alternative with Optimal Policy

In [27]:
K = env.computeOptimalK()
cov = np.eye(4) * 0.001


def _q_alt_single(s, a):
    """Call env.computeQFunction for one pair of 1-D vectors s and a."""
    return env.computeQFunction(s, a, K, cov, n_random_xn=100)


# With this signature np.vectorize loops over the leading axes of its two
# arguments and hands one 1-D `s` and one 1-D `a` to each call, collecting a
# scalar per pair.
Qalt = np.vectorize(_q_alt_single, signature='(k),(l)->()')

In [28]:
# Split mu's columns at index 4: the first four go to Qalt's first argument
# and the remaining ones to its second.
s_batch, a_batch = mu[:, :4], mu[:, 4:]
Qalt_est = Qalt(s_batch, a_batch)

In [29]:
# 2-norm of the gap between the env-computed values and the reduced fit, shown
# next to `error`. Both Qhat_est and error are whatever the most recently run
# comparison / scoring cells bound.
(np.linalg.norm(Qalt_est - Qhat_est, ord=2), error)

(15.973973481344625, 173.23255078942344)