In [1]:
import numpy as np
import gym
from timeit import time
from lightRaven.utils import generate_dataset, get_policy_perf
from lightRaven.sampling import IS, PDIS, WIS, CWPDIS
from lightRaven.policy import FAPolicy

In [2]:
# Experiment configuration shared by the cells below.
# NOTE(review): the recorded timing outputs further down report 1000000
# trajectories, so they appear to come from a run with a larger test_size
# than the value set here — confirm and re-run before trusting the numbers.
env_id = 'CartPole-v0'   # Gym environment id
gamma = 1.0              # discount factor
test_size = 100_000      # size passed to generate_dataset for the test dataset
n_proc = 8               # n_proc value forwarded to the lightRaven helpers

In [3]:
# Create an environment instance (its observation/action spaces are read in a
# later cell) and generate the test dataset.
# The discount is taken from the shared `gamma` setting instead of a literal so
# that changing the configuration cannot leave the dataset on a stale value.
env = gym.make(env_id)
dataset_test = generate_dataset(env_id, test_size, gamma=gamma, n_proc=n_proc)

In [4]:
# Sizes used to dimension the policy parameters:
#   act_shape - the `n` attribute of the action space
#   obs_shape - the first entry of the observation space's shape
act_shape = env.action_space.n
obs_shape = env.observation_space.shape[0]

In [5]:
# One estimator object per importance-sampling variant, all built on the same
# test dataset. Each receives the shared `gamma` setting rather than a
# hard-coded literal, so the samplers always agree with the configuration cell
# and with the gamma passed to get_policy_perf.
sampler_test_IS = IS(dataset_test, gamma=gamma)
sampler_test_PDIS = PDIS(dataset_test, gamma=gamma)
sampler_test_WIS = WIS(dataset_test, gamma=gamma)
sampler_test_CWPDIS = CWPDIS(dataset_test, gamma=gamma)

In [6]:
# Build the evaluation policy from weights drawn from N(0, 1).
# A dedicated, seeded generator is used so that "Restart Kernel -> Run All"
# evaluates the same policy every time without touching NumPy's global RNG state.
# An alternative fixed weight vector is kept here for reference:
# theta = np.array([-0.5645018, 1.67835788, -0.93696305, 5.42564286, 3.12075849,
#                   5.70131919, -0.60436665, 6.87296208])
THETA_SEED = 0
theta_rng = np.random.default_rng(THETA_SEED)
theta = theta_rng.normal(0, 1, obs_shape * act_shape)
policy = FAPolicy(obs_shape, act_shape, theta)

In [7]:
# Evaluate the policy directly with get_policy_perf over `sample_size` trials
# and report the mean of the returned values as the reference performance.
sample_size = 500
perf = get_policy_perf(env_id, sample_size, policy, gamma, n_proc)
mean_perf = perf.mean()
print(f"The real policy performance over {sample_size} trials is {mean_perf}.")

The real policy performance over 500 trials is 30.832.


In [8]:
# Hand the evaluation policy to every sampler, in the same order as they were
# created (IS, PDIS, WIS, CWPDIS).
for sampler in (sampler_test_IS, sampler_test_PDIS, sampler_test_WIS, sampler_test_CWPDIS):
    sampler.load_eval_policy(policy)

In [9]:
# Time one IS estimate. perf_counter() is a monotonic, high-resolution clock
# meant for measuring intervals; time.time() follows the wall clock and can
# jump if the system time is adjusted. (`time` is the stdlib module, imported
# via `from timeit import time` in the first cell.)
# The message reports the configured test_size, not a measured dataset length.
start = time.perf_counter()
print(f"IS performance estimation: {sampler_test_IS.get_est().mean()}")
end = time.perf_counter()
print(f"{test_size} trajectories are done in {end-start:.2f} seconds!")

  traj_size = dataset_s[i].shape[0]


IS performance estimation: 25.736615883076045
1000000 trajectories are done in 2.26 seconds!


In [10]:
# Time one PDIS estimate. perf_counter() is a monotonic, high-resolution clock
# meant for measuring intervals; time.time() follows the wall clock and can
# jump if the system time is adjusted. (`time` is the stdlib module, imported
# via `from timeit import time` in the first cell.)
# The message reports the configured test_size, not a measured dataset length.
start = time.perf_counter()
print(f"PDIS performance estimation: {sampler_test_PDIS.get_est().mean()}")
end = time.perf_counter()
print(f"{test_size} trajectories are done in {end-start:.2f} seconds!")

PDIS performance estimation: 25.692445685554894
1000000 trajectories are done in 2.12 seconds!


In [11]:
# Time one WIS estimate (n_batch=1). perf_counter() is a monotonic,
# high-resolution clock meant for measuring intervals; time.time() follows the
# wall clock and can jump if the system time is adjusted. (`time` is the stdlib
# module, imported via `from timeit import time` in the first cell.)
# The message reports the configured test_size, not a measured dataset length.
start = time.perf_counter()
print(f"WIS performance estimation: {sampler_test_WIS.get_est(n_batch=1).mean()}")
end = time.perf_counter()
print(f"{test_size} trajectories are done in {end-start:.2f} seconds!")

WIS performance estimation: 25.743667185319403
1000000 trajectories are done in 3.62 seconds!


In [12]:
# Time one CWPDIS estimate (n_batch=1). perf_counter() is a monotonic,
# high-resolution clock meant for measuring intervals; time.time() follows the
# wall clock and can jump if the system time is adjusted. (`time` is the stdlib
# module, imported via `from timeit import time` in the first cell.)
# The message reports the configured test_size, not a measured dataset length.
start = time.perf_counter()
print(f"CWPDIS performance estimation: {sampler_test_CWPDIS.get_est(n_batch=1).mean()}")
end = time.perf_counter()
print(f"{test_size} trajectories are done in {end-start:.2f} seconds!")

CWPDIS performance estimation: 24.2468449290958
1000000 trajectories are done in 52.89 seconds!


In [13]:
# Benchmark the IS estimator: %timeit re-runs get_est() and reports the
# per-loop mean and standard deviation.
%timeit sampler_test_IS.get_est()

1.34 s ± 18 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [14]:
# Benchmark the PDIS estimator: %timeit re-runs get_est() and reports the
# per-loop mean and standard deviation.
%timeit sampler_test_PDIS.get_est()

1.63 s ± 34.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
# Benchmark the WIS estimator with n_batch=30.
# NOTE(review): the single timed WIS run earlier in the notebook uses n_batch=1,
# so the two timings are not directly comparable — confirm this is intended.
%timeit sampler_test_WIS.get_est(n_batch=30)

2.21 s ± 9.09 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
# Benchmark the CWPDIS estimator with n_batch=30.
# NOTE(review): the single timed CWPDIS run earlier in the notebook uses
# n_batch=1, so the two timings are not directly comparable — confirm this is intended.
%timeit sampler_test_CWPDIS.get_est(n_batch=30)

41.2 s ± 312 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [17]:
from lightRaven.func import mcma_lb, mpeb_lb, t_lb

In [23]:
# Summarise the distribution of the directly measured policy returns.
# Sorting ascending lets positional indexing pick order statistics; re-running
# the cell is harmless because sorting an already sorted array is a no-op.
perf = np.sort(perf)
# Index by the array's actual length rather than the configured sample_size, so
# the quantile positions stay valid even if the two ever differ.
n_perf = perf.size
print(f"Min: {perf.min()}, Mean: {perf.mean()}, Max: {perf.max()}")
# Label corrected from "Medium" to "Median": the value is the middle order statistic.
print(f"Q1: {perf[int(n_perf*0.25)]}, Median: {perf[int(n_perf*0.5)]}, Q3: {perf[int(n_perf*0.75)]}")
print(f"The least 5% of the real performance: {perf[:int(n_perf*0.05)]}")

Min: 16.0, Mean: 30.832, Max: 79.0
Q1: 23.0, Medium: 28.0, Q3: 35.0
The least 5% of the real performance: [16. 17. 18. 18. 18. 19. 19. 19. 19. 19. 19. 19. 19. 19. 19. 20. 20. 20.
 20. 20. 20. 20. 20. 20. 20.]


In [19]:
print(f"IS Performance low bound: {t_lb(sampler_test_IS.get_est())}")
%timeit mpeb_lb(sampler_test_IS.get_est())

IS Performance low bound: 25.64972486394926
35.5 s ± 1.08 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
print(f"PDIS Performance low bound: {t_lb(sampler_test_PDIS.get_est())}")
%timeit t_lb(sampler_test_PDIS.get_est())

PDIS Performance low bound: 25.653616626384345
1.64 s ± 22.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
print(f"WIS Performance low bound: {t_lb(sampler_test_WIS.get_est(n_batch=30))}")
%timeit t_lb(sampler_test_WIS.get_est(n_batch=30))

WIS Performance low bound: 25.690460419529085
2.18 s ± 14.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
print(f"CWPDIS Performance low bound: {t_lb(sampler_test_CWPDIS.get_est(n_batch=30))}")
%timeit t_lb(sampler_test_CWPDIS.get_est(n_batch=30))

CWPDIS Performance low bound: 24.21615810976308
41 s ± 372 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
