In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go


from mab import ABn

In [2]:
class Frame:
    """An ad frame (either combi or stationary) with a fixed click-through rate.

    Attributes:
        ctr: Probability that a single impression of this frame is clicked.
        name: Human-readable label used when printing the frame.
    """

    def __init__(self, ctr, name):
        self.name = name
        self.ctr = ctr

    def display_frame(self):
        """Simulate showing the frame once to a potential customer.

        Draws a single Bernoulli trial with success probability ``self.ctr``.

        Returns:
            1 if the simulated impression was clicked, 0 otherwise.

        Note:
            ``ctr`` can be replaced with a different estimate such as a
            Wilson-score CTR.
        """
        return np.random.binomial(n=1, p=self.ctr)

    def __str__(self):
        """Return a short label identifying the frame by name."""
        return f'frame name = {self.name}'

In [3]:
# Seed NumPy's global RNG so every simulated click sequence (and therefore
# every curve in the final comparison plot) is reproducible on a fresh run.
SEED = 42
np.random.seed(SEED)

# Candidate frames with their true (simulated) click-through rates.
Frame1 = Frame(0.004, "frme1")
Frame2 = Frame(0.016, "frme2")
Frame3 = Frame(0.02, "frme3")
Frame4 = Frame(0.028, "frme4")
Frame5 = Frame(0.031, "frme5")
frames = [Frame1, Frame2, Frame3, Frame4, Frame5]

# Running-average reward curve per method, keyed by method name. The strategy
# cells store their results here and the Performance section turns it into a
# DataFrame; it must exist before any of them run, otherwise the first
# `history[...] = ...` assignment raises NameError.
history = {}

# A/B/n test

$Q_{n+1} = Q_{n} + {1\over n}(R_{n} - Q_{n})$ 

where $Q_{n} = {R_{1} +R_{2} + ... + R_{n-1} \over n-1}$

In [None]:
# Impression budget: exploration (test) phase, then exploitation (prod) phase.
n_test, n_prod = 10000, 50000

# Per-frame running statistics, one slot per candidate frame:
# q_values holds the estimated reward, imps the total impressions.
n_frames = len(frames)
q_values, imps = np.zeros(n_frames), np.zeros(n_frames)

# Cumulative reward and the running average after each impression.
total_reward, avg_reward = 0, []

In [None]:
# Test phase: each impression goes to a uniformly random frame so that every
# frame's reward estimate is built from roughly the same amount of traffic.
for shown in range(1, n_test + 1):
    arm = np.random.randint(n_frames)
    click = frames[arm].display_frame()  # 1 on click, 0 otherwise

    # Incremental sample mean: Q <- Q + (1/N) * (R - Q)
    imps[arm] += 1
    step_size = 1 / imps[arm]
    q_values[arm] += step_size * (click - q_values[arm])

    # Running average reward over all impressions served so far.
    total_reward += click
    avg_reward.append(total_reward / shown)

In [None]:
# Now showing best frame "all the time"
best_frame = frames[np.argmax(q_values)]
for i in range(n_prod):
    reward = best_frame.display_frame()
    total_reward += reward
    
    avg_reward_so_far = total_reward/(n_test + i + 1)
    avg_reward.append(avg_reward_so_far)
    
history["ABn"] = avg_reward

### Example using the `ABn` class

In [None]:
# Run the A/B/n experiment through the ABn helper imported from `mab`
# instead of the hand-written loops.
# NOTE(review): presumably run_test() performs the random-exploration phase
# and populates q_values / best_a - confirm against mab.ABn.
abn_method = ABn("A/B/n method", frames)
abn_method.run_test(n_test =10000)

# Print the winner two ways: looked up from the q_values by hand, and as
# reported by the object itself.
print(f"best frame = {abn_method.actions[np.argmax(abn_method.q_values)]}")
print(f"{abn_method.best_a}")

# NOTE(review): presumably serves the chosen action for n_prod further
# impressions - confirm against mab.ABn.
abn_method.run_prod(n_prod=50000)

# e-greedy

The Q-update rule is the same as in the A/B/n test; however, there are two main differences:

1. There is one additional hyperparameter, $\epsilon$.
2. The action with the highest q_value is chosen from the start, rather than only after a separate test phase.

In [None]:
# Total impressions to serve, and the probability of exploring a random frame.
n_prod, eps = 60000, 0.1

# Fresh per-frame statistics for this method:
# q_values holds the estimated reward, imps the impression counts.
n_frames = len(frames)
q_values, imps = np.zeros(n_frames), np.zeros(n_frames)

# Cumulative reward and the running average after each impression.
total_reward, avg_reward = 0, []

In [None]:
# The very first impression goes to a uniformly random frame.
arm = np.random.randint(n_frames)

for shown in range(1, n_prod + 1):
    click = frames[arm].display_frame()

    # Incremental sample mean: Q <- Q + (1/N) * (R - Q)
    imps[arm] += 1
    step_size = 1 / imps[arm]
    q_values[arm] += step_size * (click - q_values[arm])

    # Running average reward over all impressions served so far.
    total_reward += click
    avg_reward.append(total_reward / shown)

    # Choose the frame for the next impression: explore a random frame with
    # probability eps, otherwise exploit the current best estimate.
    explore = np.random.uniform() <= eps
    arm = np.random.randint(n_frames) if explore else np.argmax(q_values)

history["e-greedy"] = avg_reward

# Upper Confidence Bounds (UCB)

$A_{t} = \underset{a}{\operatorname{argmax}}[Q_{t}(a) + c \sqrt{\ln t \over N_{t}(a)}]$

where $uncertainty = \sqrt{\ln t \over N_{t}(a)}$

- c = hyperparameter that scales the uncertainty measure.
- q_values are updated in the same way as in A/B/n and e-greedy.

In [7]:
# Total impressions to serve, and the weight applied to the uncertainty term.
n_prod, c = 60000, 0.1

# Fresh per-frame statistics for this method:
# q_values holds the estimated reward, imps the impression counts.
n_frames = len(frames)
q_values, imps = np.zeros(n_frames), np.zeros(n_frames)

# Cumulative reward and the running average after each impression.
total_reward, avg_reward = 0, []

# Frame positions 0..n_frames-1, used to pick among frames by boolean mask.
fr_indices = np.arange(n_frames)

In [8]:
# Inspect the impression counters before the UCB loop runs.
imps

array([0., 0., 0., 0., 0.])

In [None]:
for t in range(1, n_prod + 1):
    unseen = fr_indices[imps == 0]
    if unseen.size:
        # The uncertainty term divides by the impression count, so every
        # frame is shown once (in random order) before the bound is used.
        arm = np.random.choice(unseen)
    else:
        # Upper confidence bound: estimate plus c * sqrt(ln t / N(a)).
        bonus = np.sqrt(np.log(t) / imps)
        arm = np.argmax(q_values + c * bonus)

    click = frames[arm].display_frame()

    # Incremental sample mean: Q <- Q + (1/N) * (R - Q)
    imps[arm] += 1
    step_size = 1 / imps[arm]
    q_values[arm] += step_size * (click - q_values[arm])

    # Running average reward over all impressions served so far.
    total_reward += click
    avg_reward.append(total_reward / t)

history["UCB"] = avg_reward

# Thompson sampling

$p(\theta_{k}) = {{\Gamma (\alpha_{k} + \beta_{k})} \over {\Gamma (\alpha_{k}) \Gamma(\beta_{k}) }} \theta_{k}^{\alpha_{k} - 1}(1-\theta_{k})^{\beta_{k} - 1}$

# Performance

In [None]:
# One column per method stored in `history`, one row per impression. Building
# the frame from a dict of lists requires every method's curve to have the
# same length.
hist_df = pd.DataFrame(history)

In [None]:
# Method names, in column order, used to draw one trace per method.
methods = list(hist_df.columns)

In [None]:
fig = go.Figure()

# One line per method: running average reward against the impression index.
for column_name in methods:
    curve = go.Scatter(x=hist_df.index, y=hist_df[column_name], name=column_name)
    fig.add_trace(curve)

layout_options = {
    "title": "<b>Comparative methods for MABs</b>",
    "xaxis_title": "n_prod",
    "yaxis_title": "Avg. Reward",
}
fig.update_layout(**layout_options)

fig.show()