Greedy Approach

In [None]:
import random
import numpy as np
import math

In [None]:
# Experiment settings for the greedy run.
no_arms = 10        # number of bandit arms
iterations = 1000   # number of pulls to simulate

# One row per arm: running reward estimate and number of pulls so far.
q = np.zeros(shape=(no_arms, 1))
count = np.zeros_like(q)

In [None]:
def greedy_assign_reward(explore_steps=100):
  """Simulate a bandit run that explores uniformly first, then acts greedily.

  Reads the module-level ``no_arms`` and ``iterations`` and updates the
  module-level arrays ``q`` (per-arm running mean of observed rewards) and
  ``count`` (per-arm pull count) in place. Every reward is drawn with
  ``random.randint(1, 100)``, independently of the arm pulled.

  Args:
    explore_steps: Number of initial iterations during which the arm is
      chosen uniformly at random. After that, the arm with the largest
      current estimate is pulled. Defaults to 100, the value that used to
      be hard-coded.

  Returns:
    A tuple ``(q, np.sum(q))``: the estimates array and the sum of its
    entries. Note that the second element is a sum of estimates, not the
    cumulative reward collected during the run.
  """
  for step in range(iterations):
    if step < explore_steps:
      arm = random.randrange(no_arms)
    else:
      arm = int(np.argmax(q))
    count[arm] += 1
    reward = random.randint(1, 100)
    # Exact incremental mean: Q <- Q + (r - Q) / n. The step must stay a
    # float; truncating it to an int makes the estimate drift away from the
    # true sample mean (and int() on a 1-element array is deprecated).
    q[arm] += (reward - q[arm]) / count[arm]
  return q, np.sum(q)

In [None]:
# Run the greedy simulation. NOTE: despite its name, `total_reward` receives
# np.sum(q) -- the sum of the per-arm estimates, not the reward collected.
q, total_reward = greedy_assign_reward()

In [None]:
# Display the per-arm estimates together with their sum.
q, total_reward

(array([[52.],
        [52.],
        [52.],
        [35.],
        [42.],
        [49.],
        [51.],
        [55.],
        [44.],
        [50.]]), 482.0)

Epsilon Greedy Approach

In [None]:
# Experiment settings for the epsilon-greedy run.
no_arms = 10        # number of bandit arms
iterations = 1000   # number of pulls to simulate
epsilon = 0.8       # threshold compared against the per-step random draw

# One row per arm: running reward estimate and number of pulls so far.
q = np.zeros(shape=(no_arms, 1))
count = np.zeros_like(q)

In [None]:
def egreedy_assign_reward():
  """Simulate an epsilon-greedy bandit run.

  Reads the module-level ``no_arms``, ``iterations`` and ``epsilon`` and
  updates the module-level arrays ``q`` (per-arm running mean of observed
  rewards) and ``count`` (per-arm pull count) in place. On each iteration a
  uniformly random arm is pulled with probability ``epsilon``; otherwise the
  arm with the largest current estimate is pulled. Every reward is drawn with
  ``random.randint(1, 100)``, independently of the arm pulled.

  Returns:
    A tuple ``(q, np.sum(q))``: the estimates array and the sum of its
    entries. Note that the second element is a sum of estimates, not the
    cumulative reward collected during the run.
  """
  for _ in range(iterations):
    # Draw a float in [0, 1). An integer draw from {0, 1} would explore
    # exactly half of the time for every epsilon in (0, 1].
    if random.random() < epsilon:
      arm = random.randrange(no_arms)
    else:
      arm = int(np.argmax(q))
    count[arm] += 1
    reward = random.randint(1, 100)
    # Exact incremental mean: Q <- Q + (r - Q) / n (no integer truncation).
    q[arm] += (reward - q[arm]) / count[arm]
  return q, np.sum(q)

In [None]:
# Run the epsilon-greedy simulation. NOTE: despite its name, `total_reward`
# receives np.sum(q) -- the sum of the per-arm estimates, not the reward collected.
q, total_reward = egreedy_assign_reward()

In [None]:
# Display the per-arm estimates together with their sum.
q, total_reward

(array([[54.],
        [56.],
        [47.],
        [50.],
        [47.],
        [56.],
        [44.],
        [54.],
        [47.],
        [48.]]), 503.0)

Decayed Epsilon

In [None]:
# Experiment settings for the decayed-epsilon run.
no_arms = 10         # number of bandit arms
iterations = 10000   # number of pulls to simulate

# One row per arm: running reward estimate and number of pulls so far.
q = np.zeros(shape=(no_arms, 1))
count = np.zeros_like(q)

In [None]:
def d_e_greedy_assign_reward():
  """Simulate an epsilon-greedy bandit run whose epsilon decays as 1 / ln(step).

  Reads the module-level ``no_arms`` and ``iterations`` and updates the
  module-level arrays ``q`` (per-arm running mean of observed rewards) and
  ``count`` (per-arm pull count) in place. On each iteration a uniformly
  random arm is pulled with probability ``epsilon``; otherwise the arm with
  the largest current estimate is pulled. Every reward is drawn with
  ``random.randint(1, 100)``, independently of the arm pulled.

  Returns:
    A tuple ``(q, np.sum(q))``: the estimates array and the sum of its
    entries. Note that the second element is a sum of estimates, not the
    cumulative reward collected during the run.
  """
  for step in range(iterations):
    # 1 / ln(step) is undefined or non-positive for step < 2 and exceeds 1 at
    # step == 2, so treat those early steps as pure exploration and clamp the
    # value to a valid probability afterwards.
    epsilon = min(1.0, 1.0 / math.log(step)) if step >= 2 else 1.0
    # Draw a float in [0, 1); an integer draw from {0, 1} would ignore the
    # actual value of epsilon.
    if random.random() < epsilon:
      arm = random.randrange(no_arms)
    else:
      arm = int(np.argmax(q))
    count[arm] += 1
    reward = random.randint(1, 100)
    # Exact incremental mean: Q <- Q + (r - Q) / n (no integer truncation).
    q[arm] += (reward - q[arm]) / count[arm]
  return q, np.sum(q)

In [None]:
# Run the decayed-epsilon simulation. NOTE: despite its name, `total_reward`
# receives np.sum(q) -- the sum of the per-arm estimates, not the reward collected.
q, total_reward = d_e_greedy_assign_reward()

In [None]:
# Display the per-arm estimates together with their sum.
q, total_reward

(array([[52.],
        [48.],
        [52.],
        [44.],
        [53.],
        [58.],
        [46.],
        [52.],
        [56.],
        [50.]]), 511.0)

Upper Confidence Bound

In [None]:
# Experiment settings for the upper-confidence-bound run.
no_arms = 10        # number of bandit arms
iterations = 1000   # number of pulls to simulate

# One row per arm: running reward estimate and number of pulls so far.
q = np.zeros(shape=(no_arms, 1))
count = np.zeros_like(q)

In [None]:
def ucb_assign_reward():
  """Simulate a bandit run that selects arms by an upper confidence bound.

  Reads the module-level ``no_arms`` and ``iterations`` and updates the
  module-level arrays ``q`` (per-arm running mean of observed rewards) and
  ``count`` (per-arm pull count) in place. The first ``no_arms`` iterations
  pull each arm once in order; afterwards the arm maximising
  ``q + sqrt(2 * ln(step) / count)`` is pulled. Every reward is drawn with
  ``random.randint(1, 100)``, independently of the arm pulled.

  Returns:
    A tuple ``(q, np.sum(q))``: the estimates array and the sum of its
    entries. Note that the second element is a sum of estimates, not the
    cumulative reward collected during the run.
  """
  for step in range(iterations):
    if step < no_arms:
      # Initial sweep: pull every arm once so that each count is non-zero
      # before the bonus divides by it (given count starts at zero).
      arm = step
    else:
      # The exploration bonus only influences which arm is selected. It is
      # kept in a temporary so that q remains the plain mean reward instead
      # of accumulating the bonus on every step.
      bonus = np.sqrt(2 * np.log(step) / count)
      arm = int(np.argmax(q + bonus))
    count[arm] += 1
    reward = random.randint(1, 100)
    # Exact incremental mean: Q <- Q + (r - Q) / n (no integer truncation).
    q[arm] += (reward - q[arm]) / count[arm]
  return q, np.sum(q)

In [None]:
# Run the UCB simulation. NOTE: despite its name, `total_reward` receives
# np.sum(q) -- the sum of the per-arm estimates, not the reward collected.
q, total_reward = ucb_assign_reward()

In [None]:
# Display the per-arm estimates together with their sum.
q, total_reward

(array([[59.],
        [59.],
        [59.],
        [58.],
        [59.],
        [59.],
        [60.],
        [59.],
        [60.],
        [60.]]), 592.0)

Incremental Uniform

In [None]:
# Experiment settings for the incremental-uniform run.
no_arms = 10        # number of bandit arms
iterations = 1000   # number of pulls to simulate

# One row per arm: running reward estimate and number of pulls so far.
q = np.zeros(shape=(no_arms, 1))
count = np.zeros_like(q)

In [None]:
def inc_uniform():
  """Simulate a bandit run that pulls the arms in round-robin order.

  Reads the module-level ``no_arms`` and ``iterations`` and updates the
  module-level arrays ``q`` (per-arm running mean of observed rewards) and
  ``count`` (per-arm pull count) in place. Iteration ``step`` pulls arm
  ``step % no_arms``. Every reward is drawn with ``random.randint(1, 100)``,
  independently of the arm pulled.

  Returns:
    A tuple ``(q, np.sum(q))``: the estimates array and the sum of its
    entries. Note that the second element is a sum of estimates, not the
    cumulative reward collected during the run.
  """
  for step in range(iterations):
    arm = step % no_arms
    count[arm] += 1
    reward = random.randint(1, 100)
    # Exact incremental mean: Q <- Q + (r - Q) / n. The step must stay a
    # float; truncating it to an int makes the estimate drift away from the
    # true sample mean (and int() on a 1-element array is deprecated).
    q[arm] += (reward - q[arm]) / count[arm]
  return q, np.sum(q)

In [None]:
# Run the round-robin simulation. NOTE: despite its name, `total_reward`
# receives np.sum(q) -- the sum of the per-arm estimates, not the reward collected.
q, total_reward = inc_uniform()

In [None]:
# Display the per-arm estimates together with their sum.
q, total_reward

(array([[49.],
        [44.],
        [50.],
        [56.],
        [53.],
        [50.],
        [48.],
        [48.],
        [53.],
        [53.]]), 504.0)