In [14]:
import numpy as np
from scipy.stats import bernoulli

In [80]:
def initialize(K, B, min_mu_cost):
    """
    Allocate the zero-filled history buffers for one bandit run.

    The number of rows is max_size = int(2*B/min_mu_cost).

    Parameters
    K - Number of arms
    B - Budget
    min_mu_cost - The minimum cost in the bernoulli trial (must be strictly positive)
    Returns
    N - A 2d array of shape (max_size, K) for the total number of times each arm has been played at each time step
    X - A 2d array of shape (max_size, K) for the total reward obtained from each arm at each time step
    C - A 2d array of shape (max_size, K) for the total cost obtained from each arm at each time step
    arm_pulled - A 1d array of length max_size for the arm played at each time step
    Raises
    ValueError - If min_mu_cost is not strictly positive
    """
    # A zero (or NaN/negative) minimum cost makes the row count undefined;
    # fail with a clear message instead of a division error.
    if not min_mu_cost > 0:
        raise ValueError("min_mu_cost must be strictly positive, got %r" % (min_mu_cost,))
    max_size = int(2*B/min_mu_cost)
    N = np.zeros((max_size, K))
    X = np.zeros((max_size, K))
    C = np.zeros((max_size, K))
    arm_pulled = np.zeros(max_size)
    return N,X,C,arm_pulled

def UCB_BV2(X, C, N, lam, t):
    """
    Pick the arm with the largest index D_t computed from row t of the histories.

    D_t = X[t]/C[t] + (1 + 1/(lam - sqrt(log(t)/N[t])))*sqrt(log(t)/N[t])/lam,
    where the ratio X[t]/C[t] is formed from the per-arm averages X[t]/N[t] and C[t]/N[t].

    Parameters
    X - 2d array of total rewards, indexed [time step][arm]
    C - 2d array of total costs, indexed [time step][arm]
    N - 2d array of play counts, indexed [time step][arm]
    lam - Scalar used in the exploration term
    t - Row of the histories to read (also the argument of the logarithm)
    Returns
    Index of the arm maximising D_t
    """
    pulls = N[t]
    mean_reward = X[t] / pulls
    mean_cost = C[t] / pulls
    # Average costs at or below 1e-5 are replaced by 1e-10 so the ratio below
    # never divides by zero; the input array C itself is left untouched.
    mean_cost = np.where(mean_cost <= 1e-5, 1e-10, mean_cost)
    radius = np.sqrt(np.log(t) / pulls)
    bonus = (1 + 1 / (lam - radius)) * radius / lam
    return np.argmax(mean_reward / mean_cost + bonus)
    
def main_algo(K, B, reward_means, cost_means):
    """
    Play arms with bernoulli rewards and costs until the budget is spent.

    Every arm is played once, then arms are chosen by UCB_BV2 while budget
    remains (or until the preallocated history rows run out).

    Parameters
    K - Number of arms
    B - Budget
    reward_means - An array bernoulli means for rewards
    cost_means - An array bernoulli means for costs
    Returns
    N - 2d array, total number of times each arm has been played at each time step
    X - 2d array, total reward obtained from each arm at each time step
    C - 2d array, total cost obtained from each arm at each time step
    arm_pulled - 1d array, the arm played at each time step
    t - Index of the last time step that was played
    """

    def update(N, X, C, t):
        # Carry the running totals of the previous step into row t.
        N[t] = N[t-1]
        X[t] = X[t-1]
        C[t] = C[t-1]

    min_mu_cost = min(cost_means)
    # initialize returns four buffers; arm_pulled must be unpacked here so it
    # is a local of this run rather than a leftover notebook global.
    N, X, C, arm_pulled = initialize(K, B, min_mu_cost)

    # Initial phase: play each arm exactly once, one arm per time step.
    for arm in range(K):
        if arm > 0:
            update(N, X, C, arm)
        N[arm][arm] += 1
        X[arm][arm] += bernoulli.rvs(reward_means[arm], size=1)[0]
        C[arm][arm] += bernoulli.rvs(cost_means[arm], size=1)[0]
        arm_pulled[arm] = arm

    t = K-1
    # Charge the budget for everything spent during the initial phase.
    B -= np.sum(C[t])

    horizon = N.shape[0]
    # Stop when the budget is exhausted, or when the next step would fall
    # outside the preallocated rows (which would otherwise raise IndexError).
    while B > 0 and t + 1 < horizon:
        t = t + 1
        ave_costs = C[t-1]/N[t-1]
        # Hack: lam is the smallest observed average cost, replaced by 0.05
        # when that average is (numerically) zero.
        lam = np.min(ave_costs)
        lam = lam if(lam - 0) > 1e-5 else 0.05
        arm = UCB_BV2(X, C, N, lam, t-1)
        arm_pulled[t] = arm
        update(N, X, C, t)
        N[t][arm] += 1
        X[t][arm] += bernoulli.rvs(reward_means[arm], size=1)[0]
        cost = bernoulli.rvs(cost_means[arm], size=1)[0]
        C[t][arm] += cost
        B -= cost
    return N,X,C,arm_pulled,t

def compute_regret(X, C, arm_pulled, tB, best_arm, reward_means=None, cost_means=None):
    """
    Compute cumulative regret and cumulative spend over the first tB time steps.

    Playing arm i once is charged
    cost_means[i]*(reward_means[best_arm]/cost_means[best_arm] - reward_means[i]/cost_means[i]),
    and the charges are accumulated over time.
    NOTE(review): weighting each pull by the arm's mean cost and accumulating
    over time is an assumed definition of regret - confirm against the intended one.

    Parameters
    X - 2d array of total rewards per time step and arm (not used by this computation)
    C - 2d array of total costs, indexed [time step][arm]
    arm_pulled - 1d array with the arm played at each time step
    tB - Number of time steps to evaluate (rows 0..tB-1)
    best_arm - Index of the arm taken as the reference (best) arm
    reward_means - An array bernoulli means for rewards
    cost_means - An array bernoulli means for costs
    Returns
    regret - 1d array of length tB, cumulative regret after each time step
    budget - 1d array of length tB, total cost spent after each time step
    Raises
    ValueError - If reward_means or cost_means is not supplied
    """
    if reward_means is None or cost_means is None:
        raise ValueError("reward_means and cost_means are required to compute the regret")
    reward_means = np.asarray(reward_means, dtype=float)
    cost_means = np.asarray(cost_means, dtype=float)

    # arm_pulled may be stored as floats; cast so it can be used as an index.
    arms = np.asarray(arm_pulled)[:tB].astype(int)

    best_ratio = reward_means[best_arm]/cost_means[best_arm]
    per_pull = cost_means[arms]*(best_ratio - reward_means[arms]/cost_means[arms])
    regret = np.cumsum(per_pull)

    # Rows of C hold running totals per arm, so summing across arms gives the
    # total spent up to and including each time step.
    budget = np.sum(np.asarray(C)[:tB], axis=1)
    return regret, budget
        

In [81]:
# Experiment configuration.
k = 10   # number of arms
B = 100  # total budget

# Seed the global numpy generator (also used by scipy's rvs by default) so
# that a fresh Restart & Run All reproduces the same run.
np.random.seed(0)

# Reward means decrease linearly: 0.80, 0.79, ..., one value per arm.
reward_means = 0.8 - 0.01*np.arange(k)
# Distinct cost means drawn without replacement from {0.1, 0.2, ..., 1.0};
# the size follows k so both arrays always match (requires k <= 10).
cost_means = np.random.choice(range(1,11), size = k, replace = False)/10
N,X,C,arm_pulled,t = main_algo(k, B, reward_means, cost_means)
# The reference arm is the one with the highest mean reward per unit of mean cost.
best_arm = np.argmax(reward_means/cost_means)

0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.05
0.1111111111111111
0.1111111111111111
0.1111111111111111
0.1111111111111111
0.1111111111111111
0.1111111111111111
0.1111111111111111
0.1111111111111111
0.1111111111111111
0.1111111111111111
0.1111111111111111
0.1111111111111111
0.1
0.09090909090909091
0.08333333333333333
0.07692307692307693
0.07142857142857142
0.06666666666666667
0.06666666666666667
0.0625
0.0625
0.0625
0.0625
0.0625
0.0625
0.0625
0.058823529411764705
0.05555555555555555
0.05263157894736842
0.05
0.05
0.05
0.05
0.047619047619047616
0.047619047619047616
0.047619047619047616
0.047619047619047616
0.047619047619047616
0.047619047619047616
0.047619047619047616
0.047619047619047616
0.045454545454545456
0.043478260869565216
0.043478260869565216
0.041666666666666664
0.04
0.038461538461538464
0.038461538461538464
0.038461538461538464
0.037037037037037035
0.037037037037037035
0.037037037037037035
0.0370370370370

In [79]:
# Total cost accumulated by each arm as of time step t (the last step returned by main_algo).
C[t]

array([ 7.,  6., 12., 10.,  7.,  8., 13., 10., 13., 14.])