# Estimate Q from trajectories

In [3]:
import numpy as np

from src.algorithm.backward_feature_selection import BackwardFeatureSelector
from src.algorithm.info_theory.entropy import NNEntropyEstimator
from src.wenvs import WrapperEnv
from src.algorithm.utils import episodes_with_len
from src.envs import lqgNdim
from src.policy_eval.fqi import QfunctionFQI
from src.policy_eval.k_predictors import QfunctionAsSum

In [4]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to the project's `src` package
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [5]:
# Build the 4-dimensional LQG environment used by every section of the notebook.
#
# The cost matrix is deliberately NOT called `Q`: further down the notebook
# `Q` is bound to a fitted Q-function (`Q = QfunctionFQI(0.9).fit(...)`).
# Sharing one name for a matrix and a callable made the notebook's state
# depend on which cell was executed last.
Q_cost = np.diag([0.9, 0.9, 0.1, 0.1])
R = Q_cost.copy()  # same diagonal weights, but an independent array

# NOTE(review): the first positional argument (0.9) is presumably the discount
# factor, since the same value is passed to the selector and Q estimators
# below. Confirm against lqgNdim.LQG_nD.
env = lqgNdim.LQG_nD(0.9, n_dim=4, Q=Q_cost, R=R)

# Wrapped environment; this is the object the sampling cells seed and roll out.
wenv = WrapperEnv(env, continuous_state=True, continuous_actions=True)

In [6]:
# Entropy estimator, built with default settings; it is passed as the first
# argument to both BackwardFeatureSelector instances below.
est = NNEntropyEstimator()

## Random Policy

In [7]:
# Seed NumPy's global RNG and the wrapped environment before sampling so the
# dataset does not change between runs.
np.random.seed(0)
wenv.seed(0)

# `k` is reused by the feature-selection cells further down.
k, num_ep = 20, 1000

# NOTE(review): policy=None presumably selects a random policy (this is the
# "Random Policy" section); confirm in episodes_with_len.
trajectories = episodes_with_len(wenv, num_ep, k, policy=None)

In [18]:
# Backward feature selector built from the entropy estimator and the
# random-policy trajectories.
# NOTE(review): nproc=None is passed explicitly only here; the optimal-policy
# section builds the selector without it. Confirm what None selects.
fs = BackwardFeatureSelector(est, trajectories, nproc=None)

In [19]:
# try_remove_all yields (feature-index set, score) pairs. They are materialised
# into a list so a specific entry can be picked by index in a later cell.
# NOTE(review): 0.9 is presumably the discount factor, matching the value given
# to the environment. Confirm.
selected = list(fs.try_remove_all(k, 0.9))
for S, err in selected:
    print(S, err)

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

HBox(children=(IntProgress(value=0, max=168), HTML(value='')))

HBox(children=(IntProgress(value=0, max=147), HTML(value='')))

HBox(children=(IntProgress(value=0, max=126), HTML(value='')))

HBox(children=(IntProgress(value=0, max=105), HTML(value='')))

HBox(children=(IntProgress(value=0, max=84), HTML(value='')))

HBox(children=(IntProgress(value=0, max=63), HTML(value='')))

HBox(children=(IntProgress(value=0, max=42), HTML(value='')))

HBox(children=(IntProgress(value=0, max=21), HTML(value='')))


{0, 1, 2, 3, 4, 5, 6} 110.31529451416485
{0, 1, 2, 4, 5, 6} 166.17446507527583
{0, 1, 2, 4, 5} 232.6004032540696
{0, 1, 2, 5} 305.93522730704854
{0, 1, 5} 413.6602346630509
{1, 5} 589.1983775927378
{1} 991.8712504167934
set() 990.8315709169458


In [13]:
# Fit a QfunctionFQI estimator on the trajectories using every feature; the
# result is called on feature matrices in the comparison cells below.
# NOTE(review): 0.9 is presumably the discount factor. Confirm.
Q = QfunctionFQI(0.9).fit(trajectories)

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




In [14]:
# Second Q estimate on the same trajectories, from QfunctionAsSum instead of
# QfunctionFQI, so the two estimators can be compared.
Qalt = QfunctionAsSum(0.9).fit(trajectories)

In [45]:
# Inputs for the comparison: from the selector's stored data, drop the last
# entry along axis 1 and take index 0 along axis 2.
mu = fs.t_step_data[:, :-1, 0]

# Evaluate both full-feature estimators on exactly the same inputs.
Q_est, Qalt_est = Q(mu), Qalt(mu)

# Size of the disagreement between the FQI and the sum-based estimate.
np.linalg.norm(Q_est - Qalt_est, ord=2)

994.0067861539009

In [29]:
# Feature indices ordered from most to least important according to the
# regressor inside the fitted Q estimator (ascending argsort, then reversed).
np.flip(np.argsort(Q.regressor.feature_importances_), axis=0)

array([1, 0, 5, 4, 6, 7, 2, 3])

In [30]:
# Pick the fourth entry produced by the backward selection: a feature-index
# set together with its score.
S, error = selected[3]
# The subset is converted from a set to a list because it is used below as a
# column index (mu[:, S]).
S = list(S)

In [31]:
# Same estimator as the full-feature `Q`, but fitted with only the selected
# feature indices S.
Qhat = QfunctionFQI(0.9).fit(trajectories, features_to_consider=S)

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




In [32]:
# Same input slice as in the earlier comparison cell.
mu = fs.t_step_data[:, :-1, 0]

# Full-feature estimate vs. the reduced one; the reduced model only receives
# the columns listed in S.
Q_est, Qhat_est = Q(mu), Qhat(mu[:, S])

# Observed gap between the two estimates, shown next to the score the
# selector reported for S.
(np.linalg.norm(Q_est - Qhat_est, ord=2), error)

(162.77503841872942, 305.93522730704854)

In [34]:
# Reduced-feature counterpart of `Qalt`: QfunctionAsSum fitted with only the
# feature indices in S.
Qalt_hat = QfunctionAsSum(0.9).fit(trajectories, features_to_consider=S)

In [46]:
# Same input slice as in the earlier comparison cells.
mu = fs.t_step_data[:, :-1, 0]

# Sum-based estimator on all features vs. on the subset S only.
Qalt_est, Qalt_hat_est = Qalt(mu), Qalt_hat(mu[:, S])

# Observed gap between the two, shown next to the selector's score for S.
(np.linalg.norm(Qalt_est - Qalt_hat_est, ord=2), error)

(2.0155737611836667e-05, 305.93522730704854)

## Optimal Policy

In [15]:
# Re-seed NumPy's global RNG and the environment with the same seed as the
# random-policy section.
np.random.seed(0)
wenv.seed(0)

# Same dataset size as before; `k` is reused by the selector cells below.
k, num_ep = 20, 1000

# This time the rollouts follow the policy returned by env.optimalPolicy().
trajectories = episodes_with_len(wenv, num_ep, k, policy=env.optimalPolicy())

In [16]:
# Fresh backward feature selector over the optimal-policy trajectories. This
# rebinds `fs`, so the random-policy selector is no longer reachable.
fs = BackwardFeatureSelector(est, trajectories)

In [17]:
# Run the backward selection on the optimal-policy data and keep every
# (feature-index set, score) pair so one can be picked by index later.
# NOTE(review): 0.9 is presumably the discount factor. Confirm.
selected = list(fs.try_remove_all(k, 0.9))
for S, err in selected:
    print(S, err)

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

HBox(children=(IntProgress(value=0, max=7), HTML(value='')))

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


{0, 1, 2, 3, 4, 5, 7} 12.371901498378747
{0, 1, 2, 3, 4, 5} 18.40528263404554
{0, 1, 2, 3, 4} 30.9793743206337
{0, 1, 2, 3} 40.85389118147725
{0, 1, 3} 99.09944349572892
{0, 1} 180.1017484541161
{0} 380.55570881108224
set() 397.86081495138296


In [18]:
# Refit the full-feature QfunctionFQI estimator, now on the optimal-policy
# trajectories. This rebinds `Q` from the random-policy section.
Q = QfunctionFQI(0.9).fit(trajectories)

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




In [19]:
# Feature indices ordered from most to least important according to the
# regressor inside the Q estimator fitted on optimal-policy data.
np.flip(np.argsort(Q.regressor.feature_importances_), axis=0)

array([0, 5, 4, 1, 3, 2, 7, 6])

In [20]:
# Pick the sixth entry produced by the backward selection: a feature-index
# set together with its score.
S, error = selected[5]
# The subset is converted from a set to a list because it is used below as a
# column index (mu[:, S]).
S = list(S)

In [21]:
# Reduced model for the optimal-policy data: QfunctionFQI fitted with only the
# feature indices in S.
Qhat = QfunctionFQI(0.9).fit(trajectories, features_to_consider=S)

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




In [24]:
# Input slice taken from the optimal-policy selector's stored data; `mu` is
# reused by the cells that follow.
mu = fs.t_step_data[:, :-1, 0]

# Full-feature estimate vs. the reduced one (which only sees columns S).
Q_est, Qhat_est = Q(mu), Qhat(mu[:, S])

# Observed gap between the two estimates, shown next to the selector's score.
(np.linalg.norm(Q_est - Qhat_est, ord=2), error)

(0.07491246977705439, 180.1017484541161)

### With handmade S

In [25]:
# Hand-picked subset instead of one returned by the backward selection.
# NOTE(review): presumably the first two state dimensions (0, 1) and the first
# two action dimensions (4, 5), given that a later cell splits the columns as
# mu[:, :4] / mu[:, 4:]. Confirm.
S = [0,1,4,5]
# Score this subset with the same k and 0.9 used for try_remove_all.
error = fs.scoreSubset(k, 0.9, S)

# Reduced QfunctionFQI model restricted to the hand-picked features.
Qhat = QfunctionFQI(0.9).fit(trajectories, features_to_consider=S)

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))




In [26]:
# `mu` is the input slice computed in the previous comparison cell.
# Full-feature estimate vs. the model fitted on the hand-picked subset.
Q_est, Qhat_est = Q(mu), Qhat(mu[:, S])

# Observed gap, shown next to the score returned by fs.scoreSubset for S.
(np.linalg.norm(Q_est - Qhat_est, ord=2), error)

(0.07618568275108706, 173.23255078942344)

### Alternative with Optimal Policy

In [27]:
# Gain returned by the environment and a small isotropic 4x4 covariance; both
# are forwarded unchanged to env.computeQFunction.
K = env.computeOptimalK()
cov = np.eye(4) * 0.001

# Wrap env.computeQFunction so it can be applied row by row: the signature
# declares two 1-D inputs and one scalar result per row.
Qalt = np.vectorize(
    lambda state, action: env.computeQFunction(state, action, K, cov, n_random_xn=100),
    signature='(k),(l)->()',
)

In [28]:
# Evaluate the environment-based Q on every row of `mu`: the first four columns
# are passed as the first argument and the remaining columns as the second
# (presumably state and action respectively; confirm).
Qalt_est = Qalt(mu[:, :4], mu[:, 4:])

In [29]:
# Gap between the environment-based Q values and the reduced FQI estimate
# fitted on the hand-picked subset, shown next to that subset's score.
# NOTE(review): this compares against Qhat_est (reduced model), not Q_est
# (full model). Confirm that this is the intended baseline.
(np.linalg.norm(Qalt_est - Qhat_est, ord=2), error)

(15.973973481344625, 173.23255078942344)