In [16]:
import numpy as np
import os
import pandas as pd
import plotly.graph_objects as go
import sys

In [17]:
class Frame:
    """
    An ad frame (either combi or stationary) described by its
    click-through rate (CTR) and a display name.
    """
    def __init__(self, ctr, name):
        self.name = name
        self.ctr = ctr

    def display_frame(self):
        """Show the frame to one potential customer and return the reward.

        The outcome is a single Bernoulli draw with success probability
        ``self.ctr``: reward = 1 if the customer clicks, else 0.

        # Note that ctr can be replaced with a different estimate, e.g. wilson_ctr
        """
        return np.random.binomial(n=1, p=self.ctr)

    def __str__(self):
        return f'frame name = {self.name}'

In [18]:
# (CTR, name) of every frame taking part in the experiments.
frame_specs = [
    (0.004, "frme1"),
    (0.016, "frme2"),
    (0.02, "frme3"),
    (0.028, "frme4"),
    (0.031, "frme5"),
]
frames = [Frame(ctr, name) for ctr, name in frame_specs]
# Keep the individual names available as well.
Frame1, Frame2, Frame3, Frame4, Frame5 = frames

In [19]:
# Seed NumPy's global RNG once, before any experiment draws from it, so that
# "Restart Kernel -> Run All" reproduces the same rewards and curves.
SEED = 42
np.random.seed(SEED)

# Maps method name -> list of running average rewards (one entry per
# impression); each experiment below stores its curve here for the final plot.
history = {}

# A/B/n test

$Q_{n+1} = Q_{n} + {1\over n}(R_{n} - Q_{n})$ 

where $Q_{n} = {R_{1} +R_{2} + ... + R_{n-1} \over n-1}$

In [20]:
# A/B/n budget: impressions spent on the test phase, then on the winning frame.
n_test, n_prod = 10_000, 50_000

n_frames = len(frames)
# Per-frame running CTR estimates and impression counters, all starting at 0.
q_values = np.zeros(n_frames)
imps = np.zeros_like(q_values) # total impressions
# Cumulative reward and the running average reward after every impression.
total_reward, avg_reward = 0, []

In [21]:
# Test phase: every impression goes to a frame picked uniformly at random.
for shown in range(1, n_test + 1):
    fr_chosen = np.random.randint(n_frames)
    reward = frames[fr_chosen].display_frame()

    # Incremental mean: Q <- Q + (1/n) * (R - Q), n = impressions of this frame.
    imps[fr_chosen] += 1
    step_size = 1/imps[fr_chosen]
    q_values[fr_chosen] += step_size * (reward - q_values[fr_chosen])

    # Running average reward over all impressions shown so far.
    total_reward += reward
    avg_reward.append(total_reward/shown)

In [22]:
# Production phase: commit to the frame with the highest estimated CTR
# and show it "all the time".
best_idx = np.argmax(q_values)
best_frame = frames[best_idx]

# `shown` continues the impression count where the test phase stopped.
for shown in range(n_test + 1, n_test + n_prod + 1):
    reward = best_frame.display_frame()
    total_reward += reward
    avg_reward.append(total_reward/shown)

history["ABn"] = avg_reward

In [23]:
# Cumulative reward (number of clicks) over the test and production phases.
print("ABn total_reward = ", total_reward)

ABn total_reward =  1651


# e-greedy

The Q-update rule is the same as in the A/B/n test; however, there are two main differences:

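1. There is one additional hyperparameter, $\epsilon$: the probability of picking a frame at random (exploration).
2. The action with the highest q_value is exploited from the start (with probability $1-\epsilon$), rather than only after a separate test phase.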

In [15]:
# e-greedy settings: total impressions and exploration probability.
n_prod, eps = 60_000, 0.1

n_frames = len(frames)
# Fresh per-frame CTR estimates and impression counters for this experiment.
q_values = np.zeros(n_frames)
imps = np.zeros_like(q_values)
# Cumulative reward and the running average reward after every impression.
total_reward, avg_reward = 0, []

In [16]:
# The very first frame is picked uniformly at random.
fr_chosen = np.random.randint(n_frames)

for shown in range(1, n_prod + 1):
    reward = frames[fr_chosen].display_frame()

    # Incremental mean: Q <- Q + (1/n) * (R - Q), n = impressions of this frame.
    imps[fr_chosen] += 1
    step_size = 1/imps[fr_chosen]
    q_values[fr_chosen] += step_size * (reward - q_values[fr_chosen])

    total_reward += reward
    avg_reward.append(total_reward/shown)

    # Next frame: explore with probability eps, otherwise exploit the best estimate.
    explore = np.random.uniform() <= eps
    fr_chosen = np.random.randint(n_frames) if explore else np.argmax(q_values)

history["e-greedy"] = avg_reward

In [17]:
# Cumulative reward (number of clicks) collected by the e-greedy run.
print("eGreedy total_reward = ", total_reward)

eGreedy total_reward =  1682


# Upper Confidence Bounds (UCB)

$A_{t} = \underset{a}{\operatorname{argmax}}[Q_{t}(a) + c \sqrt{\ln t \over N_{t}(a)}]$

where $uncertainty = \sqrt{\ln t \over N_{t}(a)}$

- q_values are updated in the same way as in A/B/n and e-greedy

hyperparameters:
- <b>c</b> : exploration coefficient that weights the uncertainty term (a larger c means more exploration).

In [18]:
# UCB settings: total impressions and the weight of the uncertainty bonus.
n_prod, c = 60_000, 0.1

n_frames = len(frames)
# Fresh per-frame CTR estimates and impression counters for this experiment.
q_values = np.zeros(n_frames)
imps = np.zeros_like(q_values)
# Cumulative reward and the running average reward after every impression.
total_reward, avg_reward = 0, []

# Frame positions 0..n_frames-1, used to pick among frames not yet shown.
fr_indices = np.arange(n_frames)

In [20]:
for t in range(1, n_prod + 1):
    unseen = imps == 0
    if unseen.any(): # randomly choose from frames with NO impressions
        fr_chosen = np.random.choice(fr_indices[unseen])
    else:
        # Estimated CTR plus an uncertainty bonus that shrinks as a frame is shown more.
        bonus = c * np.sqrt(np.log(t) / imps)
        fr_chosen = np.argmax(q_values + bonus)

    reward = frames[fr_chosen].display_frame()

    # Incremental mean: Q <- Q + (1/n) * (R - Q), n = impressions of this frame.
    imps[fr_chosen] += 1
    step_size = 1/imps[fr_chosen]
    q_values[fr_chosen] += step_size * (reward - q_values[fr_chosen])

    total_reward += reward
    avg_reward.append(total_reward/t)

history["UCB"] = avg_reward

In [None]:
# Cumulative reward (number of clicks) collected by the UCB run.
print("UCB total_reward = ", total_reward)

# Thompson sampling

$p(\theta_{k}) = {{\Gamma (\alpha_{k} + \beta_{k})} \over {\Gamma (\alpha_{k}) \Gamma(\beta_{k}) }} \theta_{k}^{\alpha_{k} - 1}(1-\theta_{k})^{\beta_{k} - 1}$

In [None]:
n_frames = len(frames)
n_prod = 60_000

# parameters in beta distribution, initialized to 1 here however we could use previous data to initialize
alphas = np.ones(n_frames)
betas = np.ones_like(alphas)

# Cumulative reward and the running average reward after every impression.
total_reward, avg_reward = 0, []

In [None]:
for shown in range(1, n_prod + 1):
    # Draw one CTR sample per frame from its Beta(alpha, beta) distribution
    # and show the frame whose sample is largest.
    theta_samples = [np.random.beta(a, b) for a, b in zip(alphas, betas)]
    fr_chosen = np.argmax(theta_samples)
    reward = frames[fr_chosen].display_frame()

    # A click (reward 1) increments alpha; no click (reward 0) increments beta.
    alphas[fr_chosen] += reward
    betas[fr_chosen] += 1 - reward

    total_reward += reward
    avg_reward.append(total_reward/shown)

history["ThompsonSampling"] = avg_reward

In [None]:
# Cumulative reward (number of clicks) collected by the Thompson sampling run.
print("ThompsonSampling total_reward = ", total_reward)

# Performance

In [None]:
# One column per method; each row is the running average reward at that impression.
hist_df = pd.DataFrame(data=history)

In [None]:
# Method names, in the order their curves were stored in `history`.
methods = list(hist_df.columns)

In [None]:
fig = go.Figure()

# One line per method: running average reward against the impression index.
for method in methods:
    trace = go.Scatter(x=hist_df.index, y=hist_df[method], name=method)
    fig.add_trace(trace)

fig.update_layout(
    title="<b>Comparative methods for MABs</b>",
    xaxis_title="n_prod",
    yaxis_title="Avg. Reward",
)

fig.show()

# Class usage examples

In [None]:
# NOTE(review): ABn, eGreedy, UpperConfidenceBounds and ThompsonSampling are not
# defined in the visible part of this notebook - confirm they are defined or
# imported before this cell, otherwise it raises NameError on a fresh kernel.

# Same five frames (CTR, name) as used in the procedural experiments above.
Frame1 = Frame(0.004, "frme1")
Frame2 = Frame(0.016, "frme2")
Frame3 = Frame(0.02, "frme3")
Frame4 = Frame(0.028, "frme4")
Frame5 = Frame(0.031, "frme5")
frames = [Frame1, Frame2, Frame3, Frame4, Frame5]

# ABn: 10000 test impressions followed by 50000 production impressions
abn_method = ABn(frames)
abn_method.run_test(n_test=10000)
abn_method.run_prod(n_prod=50000)

# eGreedy: 60000 impressions with exploration probability eps
n_prod = 60000
eps = 0.1
egreedy = eGreedy(frames, eps)
egreedy.run(n_prod=n_prod)

# UCB: same number of impressions, uncertainty weight c
c = 0.1
ucb = UpperConfidenceBounds(frames, c)
ucb.run(n_prod=n_prod)

# Thompson Sampling: same number of impressions
ts = ThompsonSampling(frames)
ts.run(n_prod=n_prod)

In [None]:
fig = go.Figure()

# One line per method object: running average reward against the impression index.
for method in [abn_method, egreedy, ucb, ts]:
    # Take the x-axis from the curve's own length rather than the global n_prod,
    # so x and y stay aligned even if a method ran a different number of
    # impressions (e.g. A/B/n, whose total is n_test + n_prod).
    n_points = len(method.avg_reward)
    fig.add_trace(
        go.Scatter(
            x = list(range(n_points)),
            y = method.avg_reward,
            name = method.name
        )
    )

fig.update_layout(title="<b>Comparative methods for MABs</b>",
                  xaxis_title = "n_prod",
                  yaxis_title = "Avg. Reward")

fig.show()