In [22]:
import numpy as np
from scipy.stats import bernoulli
from random import seed
from random import random

In [23]:
def initialize(k):
    beta_distn_reward = []
    beta_distn_cost = []

    for i in range(k):
        beta_distn_reward.append([0.0,0.0])

    for i in range(k):
        beta_distn_cost.append([0.0,0.0])

    return beta_distn_reward, beta_distn_cost

def sample_reward_and_cost(arm_pulled, mu, cost):
    reward_received = bernoulli.rvs(mu[arm_pulled], size=1)[0]
    cost_received = bernoulli.rvs(cost[arm_pulled], size=1)[0]
    return reward_received, cost_received

def update_distn_and_budget(arm_pulled, budget, beta_distn_reward, beta_distn_cost, reward_received, cost_received):
    budget -= cost_received
    # cost_records.append(cost_received)
    # reward_records.append(reward_received)
    # arm_pulled_records.append(arm_pulled)
    beta_distn_reward[arm_pulled][0] += reward_received
    beta_distn_reward[arm_pulled][1] += (1-reward_received)
    beta_distn_cost[arm_pulled][0] += cost_received
    beta_distn_cost[arm_pulled][1] += (1-cost_received)
    return budget, beta_distn_reward, beta_distn_cost

def choose_arm(beta_distn_reward, beta_distn_cost, k, budget, mu, cost):
    # reward_records = []
    # arm_pulled_records = []
    # cost_records = []

    sampled_mean_reward = np.array([0]*k, dtype=np.float)
    sampled_mean_cost = np.array([0]*k, dtype=np.float)

    if budget>0:
        for arm in range(k):
            sampled_mean_reward[arm] = np.random.beta(beta_distn_reward[arm][0]+1, beta_distn_reward[arm][1]+1)
            sampled_mean_cost[arm] = np.random.beta(beta_distn_cost[arm][0]+1, beta_distn_cost[arm][1]+1)
        arm_pulled = np.argmax(sampled_mean_reward/sampled_mean_cost)
        reward_received, cost_received = sample_reward_and_cost(arm_pulled, mu, cost)
        budget, beta_distn_reward, beta_distn_cost = update_distn_and_budget(arm_pulled, budget, beta_distn_reward, beta_distn_cost, reward_received,cost_received)

    return reward_received, cost_received, arm_pulled, budget, beta_distn_reward, beta_distn_cost

def compute_best_arm(mu, cost):
    return np.argmax(mu/cost)

In [24]:
def fairness_with_budget_thompson_sampling(mu, budget, k, cost, alpha = None, r = None):
    reward_records = []
    arm_pulled_records = []
    cost_records = []
    t=0
    arm_pulled_count = np.array([0]*k)

    beta_distn_reward, beta_distn_cost = initialize(k)

    while budget>0:
        unfair_arm = []
        unfair_val = []
        if(alpha is not None and r is not None):
            for i in range(k):
                if (r[i]*(t-1) - arm_pulled_count[i]) > alpha:
                    unfair_arm.append(i)
                    unfair_val.append(r[i]*(t-1) - arm_pulled_count[i])
        if unfair_arm:
            arm_pulled = unfair_arm[np.argmax(np.array(unfair_val))]
            reward_received, cost_received = sample_reward_and_cost(arm_pulled, mu, cost)
            # print(cost_received)
            budget, beta_distn_reward, beta_distn_cost = update_distn_and_budget(arm_pulled, budget, beta_distn_reward, beta_distn_cost, reward_received, cost_received)
        else:
            reward_received, cost_received, arm_pulled, budget, beta_distn_reward, beta_distn_cost = choose_arm(beta_distn_reward, beta_distn_cost, k, budget, mu, cost)
        
        t += 1
        arm_pulled_count[arm_pulled] += 1
        budget -= cost_received
        cost_records.append(cost_received)
        reward_records.append(reward_received)
        arm_pulled_records.append(arm_pulled)


    return arm_pulled_count, cost_records, reward_records, arm_pulled_records

In [25]:
def compute_regret(arm_pulled_records, mu, cost=[1]*k):
    regret_record = []
    sum_regret = 0.0
    best_arm = compute_best_arm(mu, cost)
    sz = len(arm_pulled_records)
    for i in range(sz):
        arm_pulled = arm_pulled_records[i]
        sum_regret += cost[arm_pulled]*(mu[best_arm]/cost[best_arm] - mu[arm_pulled]/cost[arm_pulled])
        regret_record.append(sum_regret)
    return regret_record

In [26]:
k = 10                          #Number of arms
mu = np.zeros(k)                #mean_rewards
# seed(1)
for i in range(k):
  mu[i] = random()

cost = np.zeros(k)              #mean_costs
for i in range(k):
    cost[i] = random()

B = budget = 10000              #budget
# r = np.zeros(k)
r = np.array([0.05]*k)          #fairness_array for different arms
alpha = 10000                       #tolerance_parameter for fairness
# T = 1000

###### FOR BUDGET AND FAIRNESS BOTH ########
arm_pulled_count, cost_records, reward_records, arm_pulled_records = fairness_with_budget_thompson_sampling(mu, budget, k, cost, alpha, r)
regret_record = compute_regret(arm_pulled_records, mu, cost)

###### FOR BUDGET ONLY ########

print (regret_record[-1])
print (len(regret_record))

511.4235622070859
91372
