In [None]:
import gym
import pickle
import numpy as np

from lightRaven.utils import generate_dataset, safety_test
from lightRaven.policy import FixedPolicy, FAPolicy
from lightRaven.agent import CEMSeldonian
from lightRaven.sampling import PDIS, IS
from lightRaven.func import t_ub, hoeffding_ub, mpeb_ub, mcma_ub

from constraints import g0, g1, g2
from timeit import time

In [None]:
# Run-wide settings; the cells below read these names directly.
env_id = "CartPole-v0"   # id handed to gym.make and generate_dataset
n_proc = 8               # forwarded as n_proc= to dataset generation, samplers and the agent
train_size = 10_000      # size argument for the training dataset
test_size = 1_000        # size argument for the test dataset; also reused as ref_size

In [None]:
# Live environment instance (used later for its spaces and for evaluation rollouts).
env = gym.make(env_id)

# Build the train and test datasets with the same generator call, differing
# only in the requested size. The generator is consumed left to right, so the
# training set is still produced before the test set.
dataset_train, dataset_test = (
    generate_dataset(env_id, size, n_proc=n_proc)
    for size in (train_size, test_size)
)

In [None]:
# First entry of the observation-space shape and the number of discrete
# actions; both are passed to the agent setup, safety_test and FAPolicy below.
obs_shape, act_shape = env.observation_space.shape[0], env.action_space.n

In [None]:
# One PDIS sampler per dataset, configured identically (gamma=1, same n_proc).
# The training sampler is constructed first, then the test sampler.
sampler_train, sampler_test = (
    PDIS(dataset, gamma=1, n_proc=n_proc)
    for dataset in (dataset_train, dataset_test)
)

# Upper-bound function passed as ci_ub= to both the agent and safety_test.
ci = mcma_ub


In [None]:
# Staged training: one agent is trained three times in a row. Each stage swaps
# in a different constraint function (g0 -> g1 -> g2, imported from the local
# `constraints` module) together with a smaller `extra_std`
# (2.0 -> 0.5 -> 0.1), and is warm-started from the previous stage's result.
stages = [
    ([g0], 2.0),
    ([g1], 0.5),
    ([g2], 0.1),
]
g_funcs, extra_std = stages[0]

agent = CEMSeldonian(
    epochs=10, pop_size=30, elite_ratio=0.17,
    ci_ub=ci, ref_size=test_size, g_funcs=g_funcs, c=1,
    gamma=1, extra_std=extra_std, extra_decay_time=10, n_proc=n_proc
)
agent.load_setup(obs_shape, act_shape, p_type='fa')
agent.load_sampler(sampler_train)

# Parameters used to warm-start the next stage; None for the first stage.
solution = None

for g_funcs, extra_std in stages:
    agent.g_funcs = g_funcs
    agent.extra_std = extra_std
    agent.init_params(theta=solution)

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # wall-clock adjustments during a long training run.
    start = time.perf_counter()
    agent.train()
    end = time.perf_counter()
    print(f"Finished training in {end-start:.2f} seconds.")

    # A single candidate may come back as a 1-D vector; promote it to one row
    # so the loop below always iterates over whole parameter vectors.
    thetas = np.atleast_2d(agent.get_best_candidates())

    # Check every candidate on the held-out sampler under this stage's
    # constraints, then rank by the first element of each result.
    solutions = [
        safety_test(
            obs_shape=obs_shape, act_shape=act_shape, p_type='fa', theta=theta,
            sampler=sampler_test, ref_size=test_size,
            ci_ub=ci, g_funcs=g_funcs, delta=0.1
        )
        for theta in thetas
    ]
    solutions.sort(key=lambda sol: sol[0], reverse=True)
    print(solutions)

    # The yield-ratio cell below treats a `str` in sol[1] as "no valid
    # parameters" (assumed to be safety_test's failure sentinel -- confirm
    # against lightRaven). Never feed such a sentinel to init_params: pick the
    # best-ranked real parameter vector, and if there is none keep the warm
    # start from the previous stage.
    passed = [sol[1] for sol in solutions if not isinstance(sol[1], str)]
    if passed:
        solution = passed[0]
    else:
        print("No candidate passed the safety test; keeping previous warm start.")

In [None]:
# Keep the second element of every final-stage result unless it is a string
# (string entries are skipped rather than counted as usable parameters).
elites = [cand[1] for cand in solutions if not isinstance(cand[1], str)]

# Fraction of final-stage candidates that survived the filter above.
yield_ratio = len(elites) / len(solutions)
print(f"yield ratio: {yield_ratio}")


In [None]:
n_trials = 20  # evaluation episodes per elite parameter vector


def _episode_return(theta):
    """Run one episode in `env` with a freshly built FAPolicy and return the summed reward."""
    policy = FAPolicy(obs_shape, act_shape, theta)
    total = 0
    finished = False
    state = env.reset()
    while not finished:
        # Act on the current observation, then move to the next one.
        state, reward, finished, _ = env.step(policy.act(state))
        total += reward
    return total


# One row of n_trials returns per elite, flattened into a single 1-D array.
rewards = np.array(
    [[_episode_return(theta) for _ in range(n_trials)] for theta in elites]
).ravel()
print(f"rewards: {rewards}")

In [None]:
# Episodes whose return falls below perf_lb are counted as violations.
perf_lb = 50
below_lb = rewards < perf_lb
violations = rewards[below_lb]

# Share of violating episodes; stays the integer 0 when there are none
# (which also covers the case of an empty rewards array).
violation_prob = violations.size / rewards.size if violations.size else 0

# Summary statistics are only meaningful when at least one episode was run.
if len(rewards) > 0:
    print(f"max: {rewards.max()}")
    print(f"min: {rewards.min()}")
    print(f"avg: {np.average(rewards)}")
    print(f"violation prob: {violation_prob}")

In [None]:
# Persist the run summary as a pickled list [yield_ratio, rewards, violation_prob]
# in a file named 'dynamic' in the current working directory.
with open('dynamic', 'wb') as out_file:
    pickle.dump([yield_ratio, rewards, violation_prob], out_file)