# Init

In [8]:
# coding: utf-8

import numpy as np
import matplotlib.pyplot as plt
import time
from math import log
from math import sqrt

# --- Problem size ---------------------------------------------------------
K = 1000       # number of arms
d = 10         # dimension of the arm / parameter vectors
M = 100        # number of agents

# --- Horizon sweep --------------------------------------------------------
T = 100        # current time horizon (advanced by the driver loop)
maxT = 10000   # largest horizon to simulate

# --- Algorithm parameters -------------------------------------------------
C = 0.5        # constant in the log-determinant re-planning threshold
lam = 1        # regularisation weight of the initial matrix lam * I

# --- Shared mutable state -------------------------------------------------
# Bandit instance, filled in by the driver before each run:
#   ins[0]      : theta* (the unknown parameter vector)
#   ins[1..K]   : the K arm feature vectors
ins = [0 for _ in range(K + 1)]
tracex, tracey = [], []   # horizons and matching averaged cumulative regrets
CommuCost = 0             # communication cost accumulated by simulate()

# Fix the seed so the generated instances are reproducible.
np.random.seed(187)

# Run

In [None]:
# Sample from arm k
def sample(k):
    """Draw one noisy reward from arm ``k``.

    The reward is the inner product of ``ins[0]`` (theta*) with ``ins[k]``
    plus a standard-normal noise term.

    Args:
        k: index into the module-level ``ins`` list; arms live at 1..K.

    Returns:
        The noisy reward as a Python float.
    """
    # ndarray.item() replaces np.asscalar, which was removed from NumPy.
    return np.dot(ins[0].T, ins[k]).item() + np.random.normal()

# solve the inner argmax problem
# max <x, theta> s.t. ||theta - theta_hat||_2 <= radius
#def solveArgmax(x, theta_hat, radius) :
#    return np.asscalar(np.dot(x.T, theta_hat)) + radius * np.linalg.norm(x) / sqrt(np.asscalar(x.T @ V @ x))

# solve the inner argmax problem
# max <x, theta> s.t. ||theta||_V <= radius
def solveArgmax(x, Vinv, radius):
    """Return the exploration bonus ``radius * sqrt(x^T Vinv x)`` for arm ``x``.

    Args:
        x: arm feature vector as a column array, so ``x.T @ Vinv @ x``
            evaluates to a single-element array.
        Vinv: matrix used in the quadratic form (the caller passes the
            inverse of its design matrix).
        radius: confidence radius that scales the bonus.

    Returns:
        The bonus as a Python float.
    """
    # ndarray.item() replaces np.asscalar, which was removed from NumPy.
    return radius * sqrt((x.T @ Vinv @ x).item())

# compute matrix square root
#def sqrtMat(A) :
#   u, s, v = np.linalg.svd(A)
#    s = np.diag(np.sqrt(s))
#    return u @ s @ v

# Run for horizon T
def simulate():
    """Run M LinUCB agents for T rounds with one naive synchronisation.

    Reads the module-level instance ``ins`` (``ins[0]`` is theta*,
    ``ins[1..K]`` are the arms) and the parameters ``K``, ``d``, ``T``,
    ``M``, ``C`` and ``lam``. Each agent keeps its own statistics ``V[m]``
    (regularised sum of outer products of pulled arms) and ``U[m]``
    (reward-weighted sum of pulled arms). An agent re-selects its arm only
    on the first round, right after a synchronisation, or when
    ``log det V[m]`` has grown by more than ``log(1 + C)`` since its last
    selection; otherwise it keeps pulling the same arm.

    At round ``int(T / sqrt(M))`` all agents' statistics are summed and
    broadcast back, and the module-level ``CommuCost`` is increased by
    ``2 * M * (d * d + d)``.

    Returns:
        Total (pseudo-)regret summed over all agents and all rounds.
    """
    global CommuCost  # the only module-level name that is rebound here

    # Expected reward of every arm and its gap to the best arm.
    # regret[i] is the gap of arm i (1-based); slot 0 holds the best reward.
    theta_star = ins[0]
    expected = [np.dot(theta_star.T, ins[i]).item() for i in range(1, K + 1)]
    best_reward = max(expected)
    regret = [best_reward] + [best_reward - r for r in expected]

    # Per-agent statistics, all starting from the regulariser lam * I.
    V = [lam * np.identity(d) for _ in range(M)]
    U = [np.zeros((d, 1)) for _ in range(M)]
    _, initial_logdet = np.linalg.slogdet(lam * np.identity(d))
    VlastLogd = [initial_logdet] * M  # log det V[m] at agent m's last selection
    bestarm = [1] * M

    CumRegret = 0
    sync_round = int(T / sqrt(M))  # naive communication strategy: one sync
    signal = False  # True for the round following a sync: forces re-selection
    for t in range(1, T + 1):
        for m in range(M):
            _, logd = np.linalg.slogdet(V[m])
            if t == 1 or signal or logd > log(1 + C) + VlastLogd[m]:
                Vinv = np.linalg.inv(V[m])
                theta_hat = Vinv @ U[m]
                # Confidence radius with delta = 1 / T.
                radius = sqrt(logd - d * log(lam) + 2 * log(T)) + sqrt(lam)

                # Pick the arm with the largest upper confidence bound;
                # ties resolve to the lowest index.
                bestarm[m] = 1
                bestval = float("-inf")
                for i in range(1, K + 1):
                    val = (np.dot(ins[i].T, theta_hat).item()
                           + solveArgmax(ins[i], Vinv, radius))
                    if val > bestval:
                        bestval = val
                        bestarm[m] = i

                VlastLogd[m] = logd
                # Clear the flag once the last agent has re-selected.
                if signal and m == M - 1:
                    signal = False

            arm = ins[bestarm[m]]
            y = sample(bestarm[m])
            CumRegret += regret[bestarm[m]]
            V[m] += arm @ arm.T
            U[m] += y * arm

        if t == sync_round:
            CommuCost += 2 * M * (d * d + d)
            Vagg = np.zeros((d, d))
            Uagg = np.zeros((d, 1))
            for m in range(M):
                Vagg += V[m]
                Uagg += U[m]
            # Each V[m] contains lam * I once; keep a single copy in the sum.
            Vagg -= (M - 1) * lam * np.identity(d)
            for m in range(M):
                V[m] = np.copy(Vagg)
                U[m] = np.copy(Uagg)
            signal = True
    return CumRegret
if __name__ == '__main__':
    # Sweep the horizon T from its initial value up to maxT in steps of 200.
    # For every horizon, average the cumulative regret of simulate() over
    # several freshly drawn random instances, then save and plot the curve.
    num_trials = 5  # independent random instances per horizon
    counter = 0     # total number of simulate() calls across the sweep
    while T <= maxT:
        print(T)
        Reg = 0
        for experi in range(num_trials):
            counter += 1
            Theta = np.random.normal(size=d)
            # Arm coordinates are uniform in [-rho / 2, rho / 2).
            rho = 2 / sqrt(T)
            X = np.random.random(size=(K, d)) * rho - np.ones((K, d)) * 0.5 * rho
            ins[0] = Theta.reshape((d, 1))
            for i in range(1, K + 1):
                ins[i] = X[i - 1].reshape((d, 1))
            Reg += simulate()
        tracex.append(T)
        tracey.append(Reg / num_trials)
        T += 200

    # Average communication cost per run, scaled by 8
    # (presumably bytes per transmitted float -- TODO confirm).
    CommuCost = CommuCost * 8 / counter
    print(CommuCost)

    # Context managers guarantee the files are closed even if a write fails.
    with open('LinUCB_naive.txt', 'w') as results_file:
        results_file.write(str(CommuCost))
        results_file.write('\n')
        for horizon, avg_regret in zip(tracex, tracey):
            results_file.write(str(horizon) + ' ' + str(avg_regret))
            results_file.write('\n')

    plt.plot(tracex, tracey, label='LinUCB_naive')
    plt.xlabel('Time')
    plt.ylabel('Cumulative Regret')
    plt.legend()
    plt.savefig('LinUCB_naive.png')

    # Empty marker file signalling that the whole sweep has completed.
    with open('LinUCB_naive_finish.txt', 'w'):
        pass


100
300
500
700
900
1100
1300
1500
1700
