# RS-UCB-M

Import the necessary libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import random

Define the utility function

In [2]:
def Utility(p):
    """Return the utility ``p * (1 - p)`` of a mean reward ``p``.

    The expression peaks at ``p = 1/2`` with value ``1/4``.  Only arithmetic
    operators are used, so ``p`` may be a plain number or a NumPy array
    (in which case the result is element-wise).
    """
    complement = 1 - p
    return p * complement

The argument of the utility function is the weighted sum (mixture) of the arm means

In [3]:
def MixturesP(p, p_arms):
    """Return the mean of the two-arm mixture.

    ``p`` holds the mixing weights and ``p_arms`` the arm means; only the
    first two entries of each are read.  The value is computed as one minus
    the weighted sum of the complements ``1 - p_arms[k]``, which equals the
    weighted mean of the arms whenever the two weights sum to one.
    """
    weighted_miss_first = p[0] * (1 - p_arms[0])
    weighted_miss_second = p[1] * (1 - p_arms[1])
    return 1 - (weighted_miss_first + weighted_miss_second)

Define the radius function used to build each arm's confidence set

In [4]:
def phi_inverse(x, v):
    """Return ``2 * sqrt(x / v)``.

    The callers in this file use the result as the half-width of an arm's
    confidence interval, passing ``log(time) / pulls`` as ``x``.
    """
    ratio = x / v
    return 2 * np.sqrt(ratio)

At time instant t, collect the candidate alpha values whose mixed confidence interval contains 1/2 and pick one of them uniformly at random, since in the 2-arm case the optimizer is unique

In [5]:
def UCBalpha(I, CS, K):
    """Pick an optimistic mixing coefficient for the two-arm problem.

    For every candidate ``alpha`` in ``I`` the two arms' confidence intervals
    are mixed as ``alpha*CS[0] + (1 - alpha)*CS[1]``.  A candidate is
    admissible when its mixed interval contains 1/2, the maximiser of the
    utility ``p*(1-p)``.  One admissible candidate is returned uniformly at
    random.

    If no mixed interval contains 1/2 (for example when both arms' intervals
    lie on the same side of 1/2), the candidate(s) whose interval endpoint is
    nearest to 1/2 are used instead, so the function no longer fails with an
    ``IndexError`` from ``random.choice`` on an empty list.

    Parameters
    ----------
    I : iterable of float
        Candidate values of alpha.
    CS : indexable, ``CS[k][0]`` / ``CS[k][1]``
        Lower / upper confidence bound of arm ``k`` for ``k`` in {0, 1}.
    K : int
        Number of arms.  Not used by the computation; kept so existing
        callers keep working.

    Returns
    -------
    One element of ``I``.

    Raises
    ------
    IndexError
        If ``I`` is empty, or every mixed bound is NaN, so that there is
        nothing to choose from.
    """
    target = 1/2
    containing = []
    gaps = []
    for alphaT in I:
        lower = alphaT*CS[0][0] + (1-alphaT)*CS[1][0]
        upper = alphaT*CS[0][1] + (1-alphaT)*CS[1][1]

        if lower <= target <= upper:
            containing.append(alphaT)
        else:
            # Distance from 1/2 to the nearer endpoint of the mixed interval.
            gaps.append((min(abs(lower - target), abs(upper - target)), alphaT))

    if containing:
        return random.choice(containing)

    # Fallback: no interval contains 1/2.  NaN gaps are dropped because they
    # cannot be ordered.
    gaps = [(gap, alphaT) for gap, alphaT in gaps if not np.isnan(gap)]
    if not gaps:
        raise IndexError(
            "UCBalpha found no usable candidate alpha "
            "(empty grid or NaN confidence bounds)"
        )

    smallest_gap = min(gap for gap, _ in gaps)
    nearest = [alphaT for gap, alphaT in gaps if gap == smallest_gap]
    return random.choice(nearest)

Main algorithm. The first stage explores each arm the same number of times. The second stage repeatedly pulls the arm that is most under-sampled relative to the currently estimated mixing coefficient

In [6]:
def _refresh_confidence_sets(CS, emp_pb, tau_T, time, v):
    """Recompute every arm's confidence interval in place, clipped to [0, 1]."""
    log_time = np.log(time)
    for arm in range(len(tau_T)):
        radius = phi_inverse(log_time/tau_T[arm], v)
        CS[arm][0] = max(emp_pb[arm] - radius, 0)
        CS[arm][1] = min(emp_pb[arm] + radius, 1)


def UCB(zeta=0.1, K=2, T = 10000, p1 = 0.35, p2 = 0.85, v=2):
    """Run one trial of the two-stage algorithm on two Bernoulli arms.

    Stage 1 (exploration) pulls each arm ``int(zeta*T)`` times.  Stage 2 runs
    until round ``T``: each round pulls the arm whose empirical pull fraction
    lags most behind the current target mixture ``[alpha, 1 - alpha]``, then
    refreshes the confidence sets and re-selects ``alpha`` with ``UCBalpha``.

    Parameters
    ----------
    zeta : float
        Exploration coefficient; each arm gets ``int(zeta*T)`` initial pulls.
    K : int
        Number of arms.  The arm means are ``[p1, p2]`` and ``UCBalpha``
        mixes exactly two arms, so only ``K = 2`` is supported.
    T : int
        Time horizon.
    p1, p2 : float
        Bernoulli means of the first and second arm.
    v : float
        Second argument forwarded to ``phi_inverse`` for the interval radius.

    Returns
    -------
    regret : float
        ``1/4 - Utility(MixturesP(tau_T/T, [p1, p2]))``.
    emp_pb : numpy.ndarray
        Empirical mean of each arm.
    tau_T : numpy.ndarray
        Number of pulls of each arm.
    CS : numpy.ndarray
        Final ``[lower, upper]`` confidence bounds per arm, shape ``(K, 2)``.

    Raises
    ------
    ValueError
        If ``int(zeta*T)`` is zero, i.e. no arm would be explored.
    """
    pulls_per_arm = int(zeta*T)
    if pulls_per_arm < 1:
        # Without at least one pull per arm the empirical means and the
        # interval radii (which divide by the pull counts) are undefined.
        raise ValueError("zeta*T must be at least 1 so that every arm is explored")

    # Grid of candidate mixing coefficients with spacing on the order of eps.
    eps = np.sqrt(K/T)*np.sqrt(np.log(T))
    I = np.linspace(0, 1, int(np.ceil(1/eps)))

    tau_T = np.zeros((K))
    pb = [p1, p2]
    emp_pb = np.zeros((K))
    CS = np.zeros((K, 2))

    # EXPLORATION
    for i in range(K):
        for _ in range(pulls_per_arm):
            # Index the size-1 sample: converting a 1-element array with
            # float() is deprecated in recent NumPy releases.
            reward = np.random.binomial(size=1, n=1, p=pb[i])[0]
            # Running-mean update of the empirical success probability.
            emp_pb[i] = (emp_pb[i]*tau_T[i] + reward)/(tau_T[i] + 1)
            tau_T[i] += 1

    starttime = K*pulls_per_arm

    # Initial confidence sets use the index of the last exploration round.
    _refresh_confidence_sets(CS, emp_pb, tau_T, starttime - 1, v)
    opt_alpha = UCBalpha(I, CS, K)

    for time in range(starttime, T):
        # Pull the arm whose share of pulls is furthest below its target share.
        chosen_arm = np.argmax(np.array([opt_alpha, 1-opt_alpha]) - tau_T/time)

        reward = np.random.binomial(size=1, n=1, p=pb[chosen_arm])[0]
        emp_pb[chosen_arm] = (emp_pb[chosen_arm]*tau_T[chosen_arm] + reward)/(tau_T[chosen_arm] + 1)
        tau_T[chosen_arm] += 1

        # Refresh every arm's interval: log(time) changes each round, so even
        # the arm that was not pulled gets a new radius.
        _refresh_confidence_sets(CS, emp_pb, tau_T, time, v)
        opt_alpha = UCBalpha(I, CS, K)

    regret = 1/4-Utility(MixturesP(tau_T/T, pb))

    return regret, emp_pb, tau_T, CS

    

## Means of Arms Experiments


eta_list := list of gaps eta = p1-p2 between the arm means <br />
file_path := name of the txt file for data to be saved <br />
T := time horizon <br />
p1 := mean of the first arm <br />
p2 := mean of the second arm <br />
zeta := exploration coefficient <br />

In [8]:
# Sweep the gap eta = p1 - p2 while the first arm's mean stays at p1.
p1=0.55
eta_list = [0.1, 0.2, 0.4, 0.5]
file_path = "Regrets_eta.txt"

for eta in eta_list:
    # 1000 independent trials per gap.
    for save_idx in range(1000):
        regret, emp_pb, tau_T, CS = UCB(
            zeta=0.1,
            K=2,
            T=100000,
            p1=p1,
            p2=p1-eta,
            v=2,
        )

        # Only the pull counts and empirical means are logged; regret and CS
        # are returned by UCB but not written out.
        record = f'T={100000}, zeta={0.1}, K=2, p1={p1}, p2={p1-eta}, RESULT: tau(T)={tau_T}, emp_pb={emp_pb} \n'

        # The file is reopened in append mode for every trial, one line each.
        with open(file_path, "a+") as file:
            file.write(record)


In [9]:
# Second gap sweep: a different grid of eta = p1 - p2 and a different p1.
p1=0.59
eta_list = [0.15, 0.25, 0.35, 0.45, 0.55]
file_path = "Regrets_eta_2.txt"

for eta in eta_list:
    # 1000 independent trials per gap.
    for save_idx in range(1000):
        regret, emp_pb, tau_T, CS = UCB(
            zeta=0.1,
            K=2,
            T=100000,
            p1=p1,
            p2=p1-eta,
            v=2,
        )

        # Only the pull counts and empirical means are logged; regret and CS
        # are returned by UCB but not written out.
        record = f'T={100000}, zeta={0.1}, K=2, p1={p1}, p2={p1-eta}, RESULT: tau(T)={tau_T}, emp_pb={emp_pb} \n'

        # The file is reopened in append mode for every trial, one line each.
        with open(file_path, "a+") as file:
            file.write(record)


## Time Horizon Experiments


T_list := list of different time horizons <br />
file_path := name of the txt file for data to be saved <br />
p1 := mean of the first arm <br />
p2 := mean of the second arm <br />
zeta := exploration coefficient <br />

In [7]:
import os
# Sweep the time horizon T with both arm means held fixed.
p1=0.4
p2=0.9
T_list = [10000, 20000, 30000, 40000, 50000]
file_path = "Regrets_T.txt"

for T in T_list:
    # 1000 independent trials per horizon.
    for save_idx in range(1000):
        regret, emp_pb, tau_T, CS = UCB(
            zeta=0.1,
            K=2,
            T=T,
            p1=p1,
            p2=p2,
            v=2,
        )

        # Only the pull counts and empirical means are logged; regret and CS
        # are returned by UCB but not written out.
        record = f'T={T}, zeta={0.1}, K=2, p1={p1}, p2={p2}, RESULT: tau(T)={tau_T}, emp_pb={emp_pb} \n'

        # The file is reopened in append mode for every trial, one line each.
        with open(file_path, "a+") as file:
            file.write(record)