# Multi-Armed Bandit Implementations
This notebook explores four multi-armed bandit strategies: explore-first greedy, epsilon-greedy, UCB, and gradient bandit.

## File and Data setup

In [98]:
import numpy as np
import random 
import matplotlib.pyplot as plt
# Bandit environment: `num_arms` arms played over a horizon of `T` rounds.
# The true mean of every arm is itself drawn once from Normal(mu, sigma).
mu, sigma, num_arms = 0, 1, 10
T = 1000
arm_means = np.random.normal(mu, sigma, num_arms)

# Pre-sample every reward up front so that each strategy is scored against
# the same draws. arms[r][a] is the reward of arm `a` on round `r`, sampled
# from Normal(arm_means[a], 1).
arms = np.zeros((T, num_arms))
for r in range(T):
    arms[r] = np.random.normal(arm_means, 1, num_arms)

# Benchmark: ideal[r] is the cumulative reward through round r obtained by
# taking the highest sampled reward of each round.
ideal = list(np.cumsum(arms.max(axis=1)))

## Explore First Greedy Data Collection

In [188]:
# Explore-first greedy.
# For every exploration budget 1..N: pull each arm `exploreAmount` times in
# round-robin blocks (arm a owns rounds a*exploreAmount .. (a+1)*exploreAmount-1),
# then commit to the arm with the highest sample mean for all remaining rounds.
# greedy_arr[k-1] holds the per-round rewards collected with a budget of k.
N = 25
greedy_arr = np.zeros((N, T))

for exploreAmount in range(1, N + 1):
    explore_rounds = exploreAmount * num_arms
    curr_greedy = np.zeros(T)
    explore_arr = np.zeros((num_arms, exploreAmount))

    # Exploration phase: copy each arm's block of rounds in one slice.
    for a in range(num_arms):
        start = a * exploreAmount
        stop = start + exploreAmount
        explore_arr[a] = arms[start:stop, a]
        curr_greedy[start:stop] = explore_arr[a]

    # Exploitation phase: play the empirically best arm until the horizon.
    arm_sample_mean = explore_arr.mean(axis=1)
    best = arm_sample_mean.argmax()
    curr_greedy[explore_rounds:] = arms[explore_rounds:, best]

    greedy_arr[exploreAmount - 1] = curr_greedy

# Total reward collected for each exploration budget.
greedy_perf = greedy_arr.sum(axis=1)

print(greedy_perf)
print(ideal[T-1])

[ 918.6053383   906.28761331  899.46316847 1121.43778792 1665.65051889
 1202.6694214  1191.20385956 1633.90146588 1621.21265836 1064.93592984
  860.73295908 1585.38203193 1562.23948964 1042.06035162 1135.64642677
 1121.38007022 1519.75224506 1503.55406367 1491.62165532 1471.49878703
 1442.83783133 1426.32624881 1403.61439393 1391.11163107 1388.89056196]
2471.6769594673915


## Explore First Greedy Graphing

In [189]:
# plt.plot( )
# plt.title("Regret vs. N")
# plt.xlabel("N")
# plt.ylabel("Regret")

## Epsilon Greedy Data Collection

In [193]:
# Calculate E - Greedy
#
# Epsilon-greedy: after a round-robin warm-up of `exploreAmount` pulls per arm,
# every remaining round explores a uniformly random arm with probability E and
# otherwise exploits the arm whose running sample mean is currently highest.
E = 0.1
exploreAmount = 10
curr_greedy = np.zeros(T)

# Warm-up: arm a is pulled on rounds a*exploreAmount .. (a+1)*exploreAmount - 1.
explore_arr = np.zeros((num_arms, exploreAmount))
for a in range(num_arms):
    for i in range(exploreAmount):
        explore_arr[a][i] = arms[a * exploreAmount + i][a]
        curr_greedy[a * exploreAmount + i] = arms[a * exploreAmount + i][a]

# Column 0: running sample mean of each arm. Column 1: number of pulls so far.
arm_sample_mean = np.zeros((num_arms, 2))
arm_sample_mean[:, 0] = np.mean(explore_arr, axis=1)
arm_sample_mean[:, 1] = np.full(num_arms, exploreAmount)

for i in range(num_arms * exploreAmount, T):
    prob = random.random()
    if prob >= E:
        # Exploit: greedy choice on the current estimates.
        arm = arm_sample_mean[:, 0].argmax()
    else:
        # Explore: uniform over every arm (not a hard-coded range of ten).
        arm = random.randrange(num_arms)

    reward = arms[i][arm]
    curr_greedy[i] = reward

    # Incremental mean: Q_{n+1} = Q_n + (x - Q_n) / (n + 1).
    # The new sample must be weighted by 1/(n+1); weighting it by 1/n makes
    # the weights sum to more than 1 and biases the estimate on every update.
    arm_sample_mean[arm][1] += 1
    arm_sample_mean[arm][0] += (reward - arm_sample_mean[arm][0]) / arm_sample_mean[arm][1]

E_means_perf = np.sum(curr_greedy)

print(E_means_perf)
print(ideal[T - 1])



1490.8009799984598
2471.6769594673915


In [194]:
# Calculate UCB

In [195]:
# Calculate Gradient