# Estimate Q from trajectories

In [142]:
import numpy as np

from src.algorithm.backward_feature_selection import BackwardFeatureSelector
from src.algorithm.info_theory.entropy import NNEntropyEstimator
from src.wenvs import WrapperEnv
from src.algorithm.utils import episodes_with_len
from src.envs import lqgNdim
from src.policy_eval.fqi import Qfunction

In [143]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [144]:
# Matrices passed as the `Q` and `R` arguments of the 4-dimensional LQG
# environment (presumably the state/action cost weights — confirm in lqgNdim).
# They are intentionally not bound to the bare names `Q`/`R`: later cells bind
# `Q` to a fitted Q-function, and re-running this cell must not overwrite it.
lqg_Q = np.diag([0.9, 0.9, 0.1, 0.1])
lqg_R = lqg_Q.copy()
# First positional argument 0.9 is presumably the discount factor — confirm.
env = lqgNdim.LQG_nD(0.9, n_dim=4, Q=lqg_Q, R=lqg_R)
# Wrap the raw environment, declaring both states and actions as continuous.
wenv = WrapperEnv(env, continuous_state=True, continuous_actions=True)

In [145]:
# Entropy estimator handed to both BackwardFeatureSelector instances below
# (presumably a nearest-neighbour estimator, going by the class name — confirm).
est = NNEntropyEstimator()

## Random Policy

In [154]:
# Seed NumPy's global RNG and the wrapped environment before sampling.
np.random.seed(0)
wenv.seed(0)

# k is reused later as the first argument of try_remove_all; the episode
# length argument passed below is 2*k.
k = 20
num_ep = 1000
# policy=None presumably makes episodes_with_len act randomly (this is the
# "Random Policy" section) — TODO confirm against src.algorithm.utils.
trajectories = episodes_with_len(wenv, num_ep, 2*k, policy=None)

In [22]:
# Backward feature selector built from the entropy estimator and the
# random-policy trajectories sampled above.
fs = BackwardFeatureSelector(est, trajectories)

In [23]:
# Materialise everything try_remove_all yields so individual
# (subset, error) pairs can be indexed afterwards (e.g. selected[3]).
selected = list(fs.try_remove_all(k, 0.9))
# The loop variables deliberately avoid the names `S` and `error`: module-level
# loop variables persist after the loop, and re-running this cell would
# otherwise overwrite the feature subset `S` that the evaluation cells index with.
for subset, subset_err in selected:
    print(subset, subset_err)

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

HBox(children=(IntProgress(value=0, max=7), HTML(value='')))

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

{0, 1, 2, 3, 4, 5, 7} 120.32321874996671
{0, 1, 3, 4, 5, 7} 180.49365452706158
{0, 1, 4, 5, 7} 244.17245423628611
{0, 1, 4, 5} 327.92554792107546
{0, 1, 5} 446.4391525163511
{0, 1} 625.3165041343639
{1} 1048.239650453394
set() 1058.1683111870345


In [155]:
# Fit a Q-function on the trajectories using all features (no
# features_to_consider restriction). The fitted object is later called on
# feature rows and its `.regressor` is inspected.
# 0.9 is presumably the discount factor — confirm in src.policy_eval.fqi.
Q = Qfunction(0.9).fit_fqi(trajectories)

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

In [159]:
# Feature indices ordered from highest to lowest regressor importance:
# argsort is ascending, so the result is reversed along its first axis.
np.flip(np.argsort(Q.regressor.feature_importances_), axis=0)

array([1, 0, 4, 5, 7, 6, 3, 2])

In [24]:
# Take the fourth (subset, error) pair produced by the backward selection;
# in the recorded output above this is {0, 1, 4, 5}.
S, error = selected[3]
# sorted() rather than list(): iteration order of a set is not guaranteed, and
# S is used both as features_to_consider and as a column index, so a fixed
# ascending order keeps the feature columns deterministic across runs.
S = sorted(S)

In [30]:
# Same fitting call as for the full Q-function, but restricted to the
# feature indices in S via features_to_consider.
Qhat = Qfunction(0.9).fit_fqi(trajectories, features_to_consider=S)

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

In [32]:
# Evaluation inputs taken from the selector's t_step_data: every entry on
# axis 0, all but the last entry on axis 1, index 0 on axis 2.
# Assumed to be the feature vectors at the first time step, with the dropped
# last axis-1 entry being a non-feature column — TODO confirm.
mu = fs.t_step_data[:, :-1, 0]
Q_est = Q(mu)
# Qhat was fitted with features_to_consider=S, so it receives only those columns.
Qhat_est = Qhat(mu[:, S])

# Displayed pair: (2-norm of the gap between the two estimates, selector error for S).
np.linalg.norm(Q_est - Qhat_est, 2), error

(13.778537936580124, 327.92554792107546)

## Optimal Policy

In [156]:
# Re-seed NumPy's global RNG and the wrapped environment with the same seed
# used for the random-policy run.
np.random.seed(0)
wenv.seed(0)

# Same sampling parameters as the random-policy run; only the policy differs.
k = 20
num_ep = 1000
# Episodes are generated with the policy object returned by env.optimalPolicy().
trajectories = episodes_with_len(wenv, num_ep, 2*k, policy=env.optimalPolicy())

In [162]:
# Rebuild the backward feature selector on the optimal-policy trajectories
# (this rebinds `fs`, replacing the random-policy selector).
fs = BackwardFeatureSelector(est, trajectories)

In [163]:
# Materialise everything try_remove_all yields so individual
# (subset, error) pairs can be indexed afterwards (e.g. selected[5]).
selected = list(fs.try_remove_all(k, 0.9))
# The loop variables deliberately avoid the names `S` and `error`: module-level
# loop variables persist after the loop, and re-running this cell would
# otherwise overwrite the feature subset `S` that the evaluation cells index with.
for subset, subset_err in selected:
    print(subset, subset_err)

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))

HBox(children=(IntProgress(value=0, max=7), HTML(value='')))

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

HBox(children=(IntProgress(value=0, max=1), HTML(value='')))

{0, 1, 2, 3, 4, 5, 6} 9.523163071767467
{0, 1, 2, 3, 4, 5} 17.16530273064567
{0, 1, 2, 3, 4} 34.2971089410694
{0, 1, 2, 3} 45.65623477794367
{0, 1, 3} 108.49547680801483
{0, 1} 216.39803594801919
{1} 591.0926786678987
set() 206.60712383747108


In [160]:
# Fit a Q-function on the optimal-policy trajectories using all features.
# The fitted object is later called on feature rows and its `.regressor`
# is inspected. 0.9 is presumably the discount factor — confirm.
Q = Qfunction(0.9).fit_fqi(trajectories)

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

In [164]:
# Feature indices ordered from highest to lowest regressor importance:
# argsort is ascending, so the result is reversed along its first axis.
np.flip(np.argsort(Q.regressor.feature_importances_), axis=0)

array([4, 1, 5, 0, 7, 6, 3, 2])

In [169]:
# Take the sixth (subset, error) pair produced by the backward selection;
# in the recorded output above this is {0, 1}.
S, error = selected[5]
# sorted() rather than list(): iteration order of a set is not guaranteed, and
# S is used both as features_to_consider and as a column index, so a fixed
# ascending order keeps the feature columns deterministic across runs.
S = sorted(S)

In [172]:
# Same fitting call as for the full Q-function, but restricted to the
# feature indices in S via features_to_consider.
Qhat = Qfunction(0.9).fit_fqi(trajectories, features_to_consider=S)

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

In [173]:
# Evaluation inputs taken from the selector's t_step_data: every entry on
# axis 0, all but the last entry on axis 1, index 0 on axis 2.
# Assumed to be the feature vectors at the first time step, with the dropped
# last axis-1 entry being a non-feature column — TODO confirm.
mu = fs.t_step_data[:, :-1, 0]
Q_est = Q(mu)
# Qhat was fitted with features_to_consider=S, so it receives only those columns.
Qhat_est = Qhat(mu[:, S])

# Displayed pair: (2-norm of the gap between the two estimates, selector error for S).
np.linalg.norm(Q_est - Qhat_est, 2), error

(0.009736321973839382, 216.39803594801919)

### With a hand-picked feature subset S

In [174]:
# Manually chosen feature indices instead of a subset taken from `selected`.
S = [0,1,4,5]
# Score this subset directly with the selector, using the same k and 0.9
# arguments that were passed to try_remove_all.
error = fs.scoreSubset(k, 0.9, S)

# Refit the reduced Q-function on the hand-picked features only.
Qhat = Qfunction(0.9).fit_fqi(trajectories, features_to_consider=S)

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

In [175]:
# Reuses `mu` from the earlier comparison cell; it is not recomputed here.
Q_est = Q(mu)
# Evaluate the reduced Q-function on the hand-picked columns only.
Qhat_est = Qhat(mu[:, S])

# Displayed pair: (2-norm of the gap between the two estimates, scoreSubset value for S).
np.linalg.norm(Q_est - Qhat_est, 2), error

(0.009846665365945863, 206.03647901372764)

### Alternative with Optimal Policy

In [139]:
# Gain returned by the environment and a 4x4 diagonal covariance with 0.001
# on the diagonal; both are forwarded to env.computeQFunction below.
K = env.computeOptimalK()
cov = 0.001 * np.eye(4)


def _env_q_value(s, a):
    """Return env.computeQFunction for a single pair of 1-D vectors (s, a)."""
    return env.computeQFunction(s, a, K, cov, n_random_xn=100)


# The gufunc signature makes np.vectorize loop over the leading axes and pass
# one 1-D `s` and one 1-D `a` per call, collecting a scalar for each pair.
Qalt = np.vectorize(_env_q_value, signature='(k),(l)->()')

In [140]:
# Split each row of `mu` into its first four columns and the remaining ones
# and evaluate Qalt row by row (presumably state and action components,
# given the 4-dimensional environment — confirm).
Qalt_est = Qalt(mu[:, :4], mu[:, 4:])

In [141]:
# Displayed pair: (2-norm of the gap between the environment-derived values
# and the reduced estimate Qhat_est, current selector error).
# Qhat_est and error are whatever the most recently executed comparison cell
# left in the namespace — NOTE(review): confirm this is the intended pairing.
np.linalg.norm(Qalt_est - Qhat_est, 2), error

(14.956836072976332, 206.03647901372764)