## Setup



In [0]:
import numpy as np

q = np.random.normal(0.0, 2.0, size=10)
std = 0.5

## Action Value Estimation

In [0]:
q_a = np.array([0.0] * len(q))
n_a = np.array([0] * len(q))

for _ in range(1000):
    action = np.random.randint(10)
    reward = np.random.normal(q[action], std)
    n_a[action] += 1
    q_a[action] += (reward - q_a[action]) / n_a[action]

In [3]:
print(q)
print(q_a)

[-0.65626338  3.30902524  1.24768379 -1.71470725  3.2194856  -1.12777154
 -0.19602347 -2.62634229 -3.04308601  2.27663233]
[-0.64670309  3.34310222  1.25874743 -1.71544833  3.19432743 -1.09342998
 -0.08219212 -2.69294766 -3.07612633  2.23282898]


## $\epsilon$-greedy Methods


In [4]:
q_a = np.array([0.0] * len(q))
n_a = np.array([0] * len(q))

def greedy_epsilon(epsilon):
  for _ in range(5000):
    action = None
    if np.random.random() < 1 - epsilon:
      action = np.argmax(q_a)
    else:
      action = np.random.randint(10)
    reward = np.random.normal(q[action], std)
    n_a[action] += 1
    q_a[action] += (reward - q_a[action]) / n_a[action]

greedy_epsilon(epsilon = 0.1)

print(q)
print(q_a)

[-0.65626338  3.30902524  1.24768379 -1.71470725  3.2194856  -1.12777154
 -0.19602347 -2.62634229 -3.04308601  2.27663233]
[-0.56538706  3.31138381  1.25464436 -1.58737853  3.23737234 -1.06952858
 -0.21815308 -2.64531225 -3.02194455  2.19989203]


## Optimistic Initialization


In [5]:
q_a = np.array([5.0] * len(q))
greedy_epsilon(epsilon = 0.1)

print(q)
print(q_a)

[-0.65626338  3.30902524  1.24768379 -1.71470725  3.2194856  -1.12777154
 -0.19602347 -2.62634229 -3.04308601  2.27663233]
[1.98322215 4.15237135 3.6670228  1.49351269 4.12881335 1.85575876
 2.00592789 0.50795738 1.5918313  3.7399046 ]


## Moving Rewards


In [0]:
def alpha(action):
  # return 1/n_a[action]
  return 0.1

## Upper-Confidence Bound Action Selection


In [7]:
q_a = np.array([0.0] * len(q))
n_a = np.array([0] * len(q))

def ucb(c):
  for t in range(5000):
    action = np.argmax([q_a[i] + c * np.sqrt(np.log(t+1)/np.max([n_a[i], 1])) for i in range(len(q_a))])
    reward = np.random.normal(q[action], std)
    n_a[action] += 1
    q_a[action] += alpha(action) * (reward - q_a[action])

ucb(c = 2)
print(q)
print(q_a)

[-0.65626338  3.30902524  1.24768379 -1.71470725  3.2194856  -1.12777154
 -0.19602347 -2.62634229 -3.04308601  2.27663233]
[-0.36246185  2.92719078  0.750019   -0.49707868  3.14865734 -0.30861423
 -0.04593552 -0.82837246 -0.67616517  1.91434115]


## Gradient Bandits


In [8]:
q_a = np.array([0.0] * len(q))
n_a = np.array([0] * len(q))

def softmax(x):
    return np.exp(x) / np.sum(np.exp(x), axis=0)

def sgd(a):
  reward_avg = 0
  for t in range(5000):
    random_num = np.random.random()
    sm = softmax(q_a)
    action = None
    for i in range(len(sm)):
      random_num -= sm[i]
      if random_num <= 0:
        action = i
        break
    
    n_a[action] += 1
    reward = np.random.normal(q[action], std)
    reward_avg += 1/(t+1) * (reward - reward_avg)

    q_a[action] += a * (reward - reward_avg) * (1 - sm[action])
    for i in range(len(sm)):
      if i != action:
        q_a[i] -= a * (reward - reward_avg) * (sm[i])

sgd(0.1)
print(q)
print(q_a)

[-0.65626338  3.30902524  1.24768379 -1.71470725  3.2194856  -1.12777154
 -0.19602347 -2.62634229 -3.04308601  2.27663233]
[-1.82665961  9.34359328 -1.04064991 -1.7707259   2.99920385 -1.23354222
 -1.91125836 -2.35811435 -1.87262499 -0.3292218 ]
