In [1]:
import matplotlib.pyplot as plt
import numpy as np
from numpy.random import rand, seed

In [31]:
# Stationary k-armed bandit example: experiment configuration.

NK = 2                  # number of arms; sizes NMEANS and the value estimates
ITEMAX = 4              # time steps simulated in each run
EPSILON_M = [0.0, 0.1]  # exploration probabilities that are compared
NEPISODES = 1000        # number of independent runs averaged together

# Fix the state of the global NumPy generator so the draws below repeat exactly.
seed(1234)

# True mean reward of every arm, sampled once from a standard normal.
NMEANS = np.random.normal(0.0, 1.0, NK)

print(NMEANS)

[ 0.47143516 -1.19097569]


In [2]:
def _pick_uniform(count):
    """Draw one index uniformly from ``range(count)`` using a single ``rand()`` call."""
    draw = rand()
    # rand() lies in [0, 1) and the last threshold is exactly 1.0, so the scan
    # always stops inside the loop for count >= 1.
    for idx in range(count):
        if draw < (idx + 1) / count:
            return idx
    return count - 1  # defensive fallback; not reachable for count >= 1


def optimal_action(qvalue, eps):
    """
    Choose an action epsilon-greedily from the current value estimates.

    Parameters
    ----------
    qvalue : numpy array
        Estimated value of each action, indexed along axis 0.
    eps : float
        Probability of ignoring the estimates and picking an action
        uniformly at random.

    Returns
    -------
    int
        Index of the selected action. Ties between equally valued greedy
        actions are broken uniformly at random.
    """
    nactions = qvalue.shape[0]

    # Strict '<': rand() is in [0, 1), so eps == 0 never explores and
    # eps == 1 always does.
    if rand() < eps:
        return _pick_uniform(nactions)

    best = np.where(qvalue == np.max(qvalue))[0]
    if best.shape[0] > 1:
        # Several actions share the maximum: randomize among them.
        return int(best[_pick_uniform(best.shape[0])])

    return int(np.argmax(qvalue))

In [3]:
def reward_update(action, reward, qvalue_old, alpha):
    """
    Return a copy of the value estimates with one entry moved toward a reward.

    Only the entry at ``action`` changes: it is shifted by ``alpha`` times the
    gap between ``reward`` and its previous value. ``qvalue_old`` itself is
    left untouched.
    """
    estimate = qvalue_old[action]
    refreshed = qvalue_old.copy()
    refreshed[action] = estimate + alpha * (reward - estimate)
    return refreshed

In [34]:
# NOTE(review): EPSILON and ALPHA are not referenced by the cells visible in
# this part of the notebook -- presumably used further down; verify.
EPSILON = 0.1
ALPHA = 0.2

# Reset the global NumPy generator so the cells that follow are repeatable.
seed(1234)

# Accumulators indexed as [time step, exploration setting]: one row per step
# and one column per entry of EPSILON_M (instead of a hard-coded 2).
reward_avg = np.zeros((ITEMAX, len(EPSILON_M)))
optimal_avg = np.zeros((ITEMAX, len(EPSILON_M)))

In [23]:
# NOTE(review): this replaces the NEPISODES = 1000 from the parameters cell for
# everything executed afterwards -- looks like a quick-test value; confirm.
NEPISODES = 3

# Index of the arm with the highest true mean; it does not change during the runs.
best_arm = np.argmax(NMEANS)

# For stationary
for ee, epsilon in enumerate(EPSILON_M):
    # Start this setting's column from zero so that re-running the cell does
    # not pile new runs on top of the previous totals.
    reward_avg[:, ee] = 0.0
    optimal_avg[:, ee] = 0.0
    for run in range(NEPISODES):
        # Initialize q function and actions record
        qvalue = np.zeros(NK)
        nchoices = np.zeros(NK)
        for tt in range(ITEMAX):
            aa_opt = optimal_action(qvalue, epsilon)
            reward = np.random.normal(loc=NMEANS[aa_opt], scale=1.0)
            nchoices[aa_opt] += 1  # number of times aa_opt has been chosen in this run
            # A step size of 1/n makes the estimate the running mean of the
            # rewards observed for this action.
            qvalue = reward_update(aa_opt, reward, qvalue, 1 / nchoices[aa_opt])
            # Each run contributes 1/NEPISODES, so the totals end up as averages.
            reward_avg[tt, ee] += reward / NEPISODES
            optimal_avg[tt, ee] += (aa_opt == best_arm) / NEPISODES

In [46]:
# Reset the exploration rate, the value estimates and the per-action counters.
epsilon = 0.1
qvalue = np.zeros(NK)
nchoices = np.zeros(NK)

In [51]:
seed(1234)
# Step through one run, continuing from the current qvalue / nchoices.
# This run is deliberately NOT added to reward_avg / optimal_avg: those arrays
# hold averages over the NEPISODES runs of the experiment loop, and writing to
# them here (through the loop variable `ee` left over from that loop) would
# skew the statistics of the last exploration setting.
for tt in range(ITEMAX):
    aa_opt = optimal_action(qvalue, epsilon)
    reward = np.random.normal(loc=NMEANS[aa_opt], scale=1.0)
    nchoices[aa_opt] += 1  # number of times aa_opt has been chosen so far
    # A step size of 1/n keeps the estimate equal to the running mean reward.
    qvalue = reward_update(aa_opt, reward, qvalue, 1 / nchoices[aa_opt])

In [52]:
# Show the action-value estimates currently held in `qvalue`.
qvalue

array([ 0.14104231, -0.49943311])