In [1]:
from distributions.distribution_by_sequence import DistributionBySequence
from distributions.distribution import Distribution
from distributions.sequence import Sequence
from algorithms.semi_bandit_exp3 import SemiBanditExp3
from algorithms.full_bandit_exp3 import FullBanditExp3
from algorithms.semi_bandit_ftrl import SemiBanditFTRL
from algorithms.semi_bandit_ftrl_inv import SemiBanditFTRLInv
from algorithms.full_bandit_exp3_inv import FullBanditExp3Inv
from algorithms.real_lin_exp3 import RealLinExp3
from algorithms.uniform_random import UniformRandom
from algorithms.non_contextual_exp3 import NonContextualExp3
from algorithms.one_per_context import OnePerContext
from experiment_manager.experiment_manager import ExperimentManager

from misc.tensor_helpers import *

from distributions.actionsets.msets import MSets

from distributions.contexts.binary_context import BinaryContext
from distributions.thetas.single_hole import SingleHole
from distributions.thetas.independent_bernoulli import IndependentBernoulli

rng = np.random.default_rng()

In [2]:
def get_dist(rng, d, K, m):
    p = np.zeros((d, K)) + 1
    for i in range(d):
        placed_already = []
        while len(placed_already) < m:

            index = rng.integers(K)
            if index not in placed_already:
                placed_already.append(index)
                p[i, index] = 0
    
    return IndependentBernoulli(d, K, p)


In [3]:
algo = FullBanditExp3Inv()
algo = SemiBanditFTRLInv()
#algo = SemiBanditExp3()

length = 10000
d = 3
K = 8
m  = 3
actionset = MSets(K, m)

epsilon = 0.25 * np.min([np.sqrt(K / length), 1])
p = np.zeros((d, K)) + 0.5
for i in range(d):
    p[i, 0] = 0.3

dist_lower_bound = Distribution(BinaryContext(d), IndependentBernoulli(d, K, p), actionset)
dist_lower_bound = Distribution(BinaryContext(d), get_dist(rng, d, K, m), actionset)

seq = dist_lower_bound.generate(length, rng, rng)
algo.set_constants(rng, seq)
print("sigma:", seq.sigma, "m:", seq.m, "beta:", algo.beta, "gamma:", algo.gamma, "eta", algo.eta)
#algo.gamma = 0.06887868239885406
#algo.eta = 0.0010063304877787227
print(algo.gamma, algo.eta)
loss, _, _, _ = algo.run_on_sequence(rng, seq)
loss_of_optimal_policy, _, _ = seq.find_optimal_policy()

comperator = UniformRandom()
comperator.set_constants(rng, seq)
loss_comperator, _, _, _ = comperator.run_on_sequence(rng, seq)

no_log_term = RealLinExp3()
no_log_term.set_constants(rng, seq)
no_log_term.gamma = np.sqrt(1 / seq.length)
no_log_term.eta = np.sqrt(np.log(len(seq.actionset.actionset)) / (seq.d * len(seq.actionset.actionset) * seq.length))
print(no_log_term.gamma, no_log_term.eta)
loss_no_log_term, _, _, _ = no_log_term.run_on_sequence(rng, seq)

comperator2 = OnePerContext()
comperator2.set_constants(rng, seq)
loss_comperator2, _, _, _ = comperator2.run_on_sequence(rng, seq)
context_algo = comperator2.context_algos[list(comperator2.context_algos.keys())[0]]
print(context_algo.gamma, context_algo.eta)

comperator3 = FullBanditExp3Inv()
comperator3.set_constants(rng, seq)
print(comperator3.gamma, comperator3.eta)
loss_comperator3, _, _, _ = comperator3.run_on_sequence(rng, seq)
print("regret", loss - loss_of_optimal_policy, loss_no_log_term - loss_of_optimal_policy, loss_comperator - loss_of_optimal_policy, loss_comperator2 - loss_of_optimal_policy, loss_comperator3 - loss_of_optimal_policy)


sigma: 1.0 m: 3 beta: 0.5 gamma: 0.2201693742868639 eta 0.00045419577688269946
0.2201693742868639 0.00045419577688269946
0.01 0.0015479155897711724
0.17019151882935946 0.007091313284556643
0.6020470780301784 0.0002228702614589316
regret 9120.0 4308.0 18761.0 4759.0 11557.0


In [4]:
algo = SemiBanditFTRLInv()
algo.set_constants(rng, seq)

seq.reset()
context, _, _, done = seq.get_next(None)

losses = []
probability_array = []
action_array = []
while not done:
    probabilities = algo.get_policy(context)
    probability_array.append(probabilities)

    print("Start next")
    print(context, probabilities)
    action_index = rng.choice(np.arange(seq.actionset.number_of_actions), p=probabilities)
    print(action_index, seq.actionset[action_index])
    action_array.append(action_index)

    print("\n")
    next_context, loss, loss_vec, done = seq.get_next(seq.actionset[action_index])
    print(loss, loss_vec)
    loss_vec[~seq.actionset[action_index]] = 0
    print(loss, loss_vec)
    if algo.full_bandit:
        algo.observe_loss(loss, context, action_index)
    else:
        algo.observe_loss_vec(loss_vec, context, action_index)

    print(algo.theta_estimate)
    print("\n\n")
    losses.append(loss)
    context = next_context

    if seq.current_index == 100:
        1/0


Start next
[1. 0. 0.] [0.17086862 0.09747883 0.         0.         0.09747883 0.
 0.         0.         0.         0.19495766 0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.07338979 0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.07338979 0.         0.         0.
 0.         0.29243648]
9 [False False  True  True  True False False False]


1.0 [1. 1. 1. 0. 0. 0. 1. 1.]
1.0 [0. 0. 1. 0. 0. 0. 0. 0.]
[[0.         0.         8.13391072 0.         0.         0.
  0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.        ]]



Start next
[0. 1. 0.] [0.17086862 0.09747883 0.   

ZeroDivisionError: division by zero