In [1]:
from distributions.distribution_by_sequence import DistributionBySequence
from distributions.distribution import Distribution
from distributions.sequence import Sequence
from algorithms.semi_bandit_exp3 import SemiBanditExp3
from algorithms.full_bandit_exp3 import FullBanditExp3
from algorithms.semi_bandit_ftrl import SemiBanditFTRL
from algorithms.semi_bandit_ftrl_inv import SemiBanditFTRLInv
from algorithms.full_bandit_exp3_inv import FullBanditExp3Inv
from algorithms.full_bandit_exp3_inv_copy import FullBanditExp3InvOld
from algorithms.real_lin_exp3 import RealLinExp3
from algorithms.uniform_random import UniformRandom
from algorithms.non_contextual_exp3 import NonContextualExp3
from algorithms.one_per_context import OnePerContext
from experiment_manager.experiment_manager import ExperimentManager

from misc.tensor_helpers import *

from distributions.actionsets.msets import MSets

from distributions.contexts.binary_context import BinaryContext
from distributions.thetas.single_hole import SingleHole
from distributions.thetas.independent_bernoulli import IndependentBernoulli



In [2]:
def get_dist(rng, d, K, m):
    p = np.zeros((d, K)) + 0.5
    for i in range(d):
        placed_already = []
        while len(placed_already) < m:

            index = rng.integers(K)
            if index not in placed_already:
                placed_already.append(index)
                p[i, index] = 0.4
    
    return IndependentBernoulli(d, K, p)


In [3]:
rng = np.random.default_rng(0)
algo = FullBanditExp3Inv()
algo = SemiBanditFTRLInv()
#algo = SemiBanditExp3()

length = 10000
d = 10
number_of_ones = 3
K = 4
m  = 2
actionset = MSets(K, m)

epsilon = 0.25 * np.min([np.sqrt(K / length), 1])
p = np.zeros((d, K)) + 0.5
for i in range(d):
    p[i, 0] = 0.45

dist_lower_bound = Distribution(BinaryContext(d, number_of_ones), get_dist(rng, d, K, m), actionset)

seq = dist_lower_bound.generate(length, rng, rng)
algo.set_constants(rng, seq)
print("sigma:", seq.sigma, "m:", seq.m, "beta:", algo.beta, "gamma:", algo.gamma, "eta", algo.eta)
#algo.gamma = 0.06887868239885406
#algo.eta = 0.0010063304877787227
loss, _, _, _ = algo.run_on_sequence(rng, seq)
loss_of_optimal_policy, _, _ = seq.find_optimal_policy()
print("Main", loss - loss_of_optimal_policy, algo.gamma, algo.eta)

algo = UniformRandom()
algo.set_constants(rng, seq)
loss, _, _, _ = algo.run_on_sequence(rng, seq)
print("UniformRandom", loss - loss_of_optimal_policy, algo.gamma, algo.eta)

algo = RealLinExp3()
algo.set_constants(rng, seq)
algo.gamma = np.sqrt(1 / seq.length)
algo.eta = np.sqrt(np.log(len(seq.actionset.actionset)) / (seq.d * len(seq.actionset.actionset) * seq.length))
loss, _, _, _ = algo.run_on_sequence(rng, seq)
print("RealLinExp3", loss - loss_of_optimal_policy, algo.gamma, algo.eta)

algo = OnePerContext()
algo.set_constants(rng, seq)
loss, _, _, _ = algo.run_on_sequence(rng, seq)
context_algo = algo.context_algos[list(algo.context_algos.keys())[0]]
print("OnePerContext", loss - loss_of_optimal_policy, algo.gamma, algo.eta)

algo = FullBanditExp3Inv()
algo.set_constants(rng, seq)
loss, _, _, _ = algo.run_on_sequence(rng, seq)
print("FullBanditExp3Inv", loss - loss_of_optimal_policy, algo.gamma, algo.eta)

algo = FullBanditExp3InvOld()
algo.set_constants(rng, seq)
loss, _, _, _ = algo.run_on_sequence(rng, seq)
print("FullBanditExp3InvOld", loss - loss_of_optimal_policy, algo.gamma, algo.eta)


sigma: 1.7320508075688772 m: 2 beta: 0.16666666666666669 gamma: 0.2278969536259813 eta 0.00043879480795569324
Main 1491.0 0.2278969536259813 0.00043879480795569324
UniformRandom 1903.0 None None
RealLinExp3 1497.0 0.01 0.0017280815322335801
OnePerContext 1760.0 None None
FullBanditExp3Inv 1962.0 0.4729860366651096 0.00018940934090372338
FullBanditExp3InvOld 1928.0 0.4729860366651096 0.00018940934090372338


In [4]:
algo = SemiBanditFTRLInv()
algo.set_constants(rng, seq)

seq.reset()
context, _, _, done = seq.get_next(None)

losses = []
probability_array = []
action_array = []
while not done:
    probabilities = algo.get_policy(context)
    probability_array.append(probabilities)

    print("Start next")
    print(context, probabilities)
    action_index = rng.choice(np.arange(seq.actionset.number_of_actions), p=probabilities)
    print(action_index, seq.actionset[action_index])
    action_array.append(action_index)

    print("\n")
    next_context, loss, loss_vec, done = seq.get_next(seq.actionset[action_index])
    print(loss, loss_vec)
    loss_vec[~seq.actionset[action_index]] = 0
    print(loss, loss_vec)
    if algo.full_bandit:
        algo.observe_loss(loss, context, action_index)
    else:
        algo.observe_loss_vec(loss_vec, context, action_index)

    print(algo.theta_estimate)
    print("\n\n")
    losses.append(loss)
    context = next_context

    if seq.current_index == 100:
        1/0


Start next
[1. 0. 0. 0. 1. 0. 0. 1. 0. 0.] [0.5 0.  0.  0.  0.  0.5]
5 [False False  True  True]


2.0 [1. 3. 1. 1.]
2.0 [0. 0. 1. 1.]
[[ 0.          0.          6.61419697  6.61025493]
 [ 0.          0.         -1.8843866  -1.88427947]
 [ 0.          0.         -1.8843866  -1.88427947]
 [ 0.          0.         -1.8843866  -1.88427947]
 [ 0.          0.          6.61419697  6.61025493]
 [ 0.          0.         -1.8843866  -1.88427947]
 [ 0.          0.         -1.8843866  -1.88427947]
 [ 0.          0.          6.61419697  6.61025493]
 [ 0.          0.         -1.8843866  -1.88427947]
 [ 0.          0.         -1.8843866  -1.88427947]]



Start next
[1. 0. 1. 0. 1. 0. 0. 0. 0. 0.] [0.50096049 0.         0.         0.         0.         0.49903951]
5 [False False  True  True]


1.0 [2. 3. 1. 0.]
1.0 [0. 0. 1. 0.]
[[ 0.          0.         13.23596232  6.61025493]
 [ 0.          0.         -3.76933523 -1.88427947]
 [ 0.          0.          4.73027537 -1.88427947]
 [ 0.          0.    

ZeroDivisionError: division by zero