# Context-Free MAB

In [1]:
from mabwiser.mab import MAB, LearningPolicy

In [2]:
######################################################################################
#
# MABWiser
# Scenario: A/B Testing for Website Layout Design
#
# An e-commerce website experiments with 2 different layouts options for their homepage
# Each layouts decision leads to generating different revenues
#
# What should the choice of layouts be based on historical data?
#
######################################################################################

In [3]:
# Arms
options = [1, 2]

In [4]:
# Historical data of layouts decisions and corresponding rewards
layouts = [1, 1, 1, 2, 1, 2, 2, 1, 2, 1, 2, 2, 1, 2, 1]
revenues = [10, 17, 22, 9, 4, 0, 7, 8, 20, 9, 50, 5, 7, 12, 10]

In [5]:
# Arm to features
arm_to_features = {1: [0, 0, 1], 2: [1, 1, 0], 3: [1, 1, 0]}

In [6]:
###################################
# Epsilon Greedy Learning Policy
###################################

In [7]:
# Epsilon Greedy learning policy with random exploration set to 15%
greedy = MAB(arms=options,
             learning_policy=LearningPolicy.EpsilonGreedy(epsilon=0.15),
             seed=123456)

In [8]:
# Learn from previous layouts decisions and revenues generated
greedy.fit(decisions=layouts, rewards=revenues)

In [9]:
# Predict the next best layouts decision
prediction = greedy.predict()

In [10]:
# Expected revenues of each layouts learnt from historical data based on epsilon greedy policy
expectations = greedy.predict_expectations()

In [11]:
# Results
print("Epsilon Greedy: ", prediction, " ", expectations)
assert(prediction == 2)

Epsilon Greedy:  2   {1: 10.875, 2: 14.714285714285714}


In [12]:
# Additional historical data becomes available which allows online learning
additional_layouts = [1, 2, 1, 2]
additional_revenues = [0, 12, 7, 19]

In [13]:
# Online updating of the model
greedy.partial_fit(additional_layouts, additional_revenues)

In [14]:
# Adding a new layout option
greedy.add_arm(3)

In [15]:
# Warm start new arm
greedy.warm_start(arm_to_features, distance_quantile=0.5)

# Parametric Contextual MAB

In [16]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from mabwiser.mab import MAB, LearningPolicy, NeighborhoodPolicy

In [17]:
######################################################################################
#
# MABWiser
# Scenario: Advertisement Optimization
#
# An e-commerce website needs to solve the problem of which ad to display to online users
# Each advertisement decision leads to generating different revenues
#
# What should the choice of advertisement be given the context of an online user
# based on customer data such as age, click rate, subscriber?
#
######################################################################################

In [18]:
# Arms
ads = [1, 2, 3, 4, 5]

In [19]:
# Historical data of ad decisions with corresponding revenues and context information
train_df = pd.DataFrame({'ad': [1, 1, 1, 2, 4, 5, 3, 3, 2, 1, 4, 5, 3, 2, 5],
                                                     'revenues': [10, 17, 22, 9, 4, 20, 7, 8, 20, 9, 50, 5, 7, 12, 10],
                                                     'age': [22, 27, 39, 48, 21, 20, 19, 37, 52, 26, 18, 42, 55, 57, 38],
                                                     'click_rate': [0.2, 0.6, 0.99, 0.68, 0.15, 0.23, 0.75, 0.17,
                                                                                    0.33, 0.65, 0.56, 0.22, 0.19, 0.11, 0.83],
                                                     'subscriber': [1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0]}
                                                     )

In [20]:
# Arm features for warm start
arm_to_features = {1: [0, 1, 1], 2: [0, 0.5, 0.5], 3: [1, 1, 0.5], 4: [0.2, 1, 0], 5: [0, 1, 0.1], 6: [0, 0.5, 0.5]}

In [21]:
# Test data to for new prediction
test_df = pd.DataFrame({'age': [37, 52], 'click_rate': [0.5, 0.6], 'subscriber': [0, 1]})
test_df_revenue = pd.Series([7, 13])

In [22]:
# Scale the training and test data
scaler = StandardScaler()
train = scaler.fit_transform(train_df[['age', 'click_rate', 'subscriber']])
test = scaler.transform(test_df)

In [23]:
##################################################
# Linear Upper Confidence Bound Learning Policy
##################################################

In [24]:
# LinUCB learning policy with alpha 1.25 and l2_lambda 1
linucb = MAB(arms=ads,
             learning_policy=LearningPolicy.LinUCB(alpha=1.25, l2_lambda=1))

In [25]:
# Learn from previous ads shown and revenues generated
linucb.fit(decisions=train_df['ad'], rewards=train_df['revenues'], contexts=train)

In [26]:
# Predict the next best ad to show
prediction = linucb.predict(test)

In [27]:
# Expectation of each ad based on learning from past ad revenues
expectations = linucb.predict_expectations(test)

In [28]:
# Results
print("LinUCB: ", prediction, " ", expectations)
assert(prediction == [5, 2])

LinUCB:  [5, 2]   [{1: -1.3094012976959437, 2: 0.10514406010286903, 3: -1.912976763975226, 4: -10.291130147595512, 5: 9.66516106096636}, {1: -4.524478702267455, 2: 14.652897134463046, 3: 4.52468441018687, 4: -3.257354449781908, 5: -8.684806295018186}]


In [29]:
# Online update of model
linucb.partial_fit(decisions=prediction, rewards=test_df_revenue, contexts=test)

In [30]:
# Update the model with new arm
linucb.add_arm(6)

In [31]:
# Warm start new arm
linucb.warm_start(arm_to_features, distance_quantile=0.75)

# Non-Parametric Contextual MAB

In [32]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from mabwiser.mab import MAB, LearningPolicy, NeighborhoodPolicy

In [33]:
######################################################################################
#
# MABWiser
# Scenario: Advertisement Optimization
#
# An e-commerce website needs to solve the problem of which ad to display to online users
# Each advertisement decision leads to generating different revenues
#
# What should the choice of advertisement be given the context of an online user
# based on customer data such as age, click rate, subscriber?
#
######################################################################################

In [34]:
# Arms
ads = [1, 2, 3, 4, 5]

In [35]:
# Historical data of ad decisions with corresponding revenues and context information
train_df = pd.DataFrame({'ad': [1, 1, 1, 2, 4, 5, 3, 3, 2, 1, 4, 5, 3, 2, 5],
                         'revenues': [10, 17, 22, 9, 4, 20, 7, 8, 20, 9, 50, 5, 7, 12, 10],
                         'age': [22, 27, 39, 48, 21, 20, 19, 37, 52, 26, 18, 42, 55, 57, 38],
                         'click_rate': [0.2, 0.6, 0.99, 0.68, 0.15, 0.23, 0.75, 0.17,
                                        0.33, 0.65, 0.56, 0.22, 0.19, 0.11, 0.83],
                         'subscriber': [1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0]}
                       )

In [36]:
# Arm features for warm start
arm_to_features = {1: [0, 1, 1], 2: [0, 0.5, 0.5], 3: [1, 1, 0.5], 4: [0.2, 1, 0], 5: [0, 1, 0.1], 6: [0, 0.5, 0.5]}

In [37]:
# Test data to for new prediction
test_df = pd.DataFrame({'age': [37, 52], 'click_rate': [0.5, 0.6], 'subscriber': [0, 1]})
test_df_revenue = pd.Series([7, 13])

In [38]:
# Scale the training and test data
scaler = StandardScaler()
train = scaler.fit_transform(train_df[['age', 'click_rate', 'subscriber']])
test = scaler.transform(test_df)

In [39]:
########################################################
# Radius Neighborhood Policy with UCB1 Learning Policy
########################################################

In [40]:
# Radius contextual policy with radius equals to 5 and ucb1 learning with alpha 1.25
radius = MAB(arms=ads,
             learning_policy=LearningPolicy.UCB1(alpha=1.25),
             neighborhood_policy=NeighborhoodPolicy.Radius(radius=5))

In [41]:
# Learn from previous ads shown and revenues generated
radius.fit(decisions=train_df['ad'], rewards=train_df['revenues'], contexts=train)

In [42]:
# Predict the next best ad to show
prediction = radius.predict(test)

In [43]:
# Expectation of each ad based on learning from past ad revenues
expectations = radius.predict_expectations(test)


In [44]:
# Results
print("Radius: ", prediction, " ", expectations)
assert(prediction == [4, 4])

Radius:  [4, 4]   [{1: 15.954532302704585, 2: 15.346215899689662, 3: 9.01288256635633, 4: 29.05701930939459, 5: 13.346215899689662}, {1: 15.954532302704585, 2: 15.346215899689662, 3: 9.01288256635633, 4: 29.05701930939459, 5: 13.346215899689662}]


In [45]:
# Online update of model
radius.partial_fit(decisions=prediction, rewards=test_df_revenue, contexts=test)

In [46]:
# Updating of the model with new arm
radius.add_arm(6)

In [47]:
# Warm start new arm
radius.warm_start(arm_to_features, distance_quantile=0.75)

# Parallel MAB

In [48]:
import numpy as np
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from mabwiser.mab import MAB, LearningPolicy

In [49]:
######################################################################################
#
# MABWiser
# Scenario: Playlist recommendation for music streaming service
#
# An online music streaming service wants to recommend a playlist to a user
# based on a user's listening history and user features. There is a large amount
# of data available to train this recommender model, which means the parallel
# functionality in MABWiser can be useful.
#
#
######################################################################################

In [50]:
# Seed
seed = 111

In [51]:
# Arms
arms = list(np.arange(100))

In [52]:
# Historical on user contexts and rewards (i.e. whether a user clicked
# on the recommended playlist or not)
contexts, rewards = make_classification(n_samples=100000, n_features=200,
                                        n_informative=20, weights=[0.01], scale=None)

In [53]:
# Independently simulate the recommended playlist for each event
decisions = np.random.choice(arms, size=100000)

In [54]:
# Split data into train and test data sets
contexts_train, contexts_test = train_test_split(contexts, test_size=0.3, random_state=seed)
rewards_train, rewards_test = train_test_split(rewards, test_size=0.3, random_state=seed)
decisions_train, decisions_test = train_test_split(decisions, test_size=0.3, random_state=seed)

In [55]:
#############################################################################
# Parallel Radius Neighborhood Policy with UCB1 Learning Policy using 8 Cores
#############################################################################

In [56]:
# Radius contextual policy with radius equals to 5 and ucb1 learning with alpha 1.25
radius = MAB(arms=ads,
                             learning_policy=LearningPolicy.UCB1(alpha=1.25),
                             neighborhood_policy=NeighborhoodPolicy.Radius(radius=5),
                             n_jobs=8)

In [57]:
# Parallel Training
# Learn from playlists shown and observed click rewards for each arm
# In reality, we can scale the data --skipping this step in the toy example here
radius.fit(decisions=decisions_train, rewards=rewards_train, contexts=contexts_train)

In [58]:
# Parallel Testing
# Predict the next best playlist to recommend
prediction = radius.predict(contexts_test)

In [59]:
# Results
print("radius: ", prediction[:10])

radius:  [4, 2, 2, 5, 1, 4, 4, 2, 5, 2]


# Simulator

In [60]:
import random
from sklearn.preprocessing import StandardScaler
from mabwiser.mab import MAB, LearningPolicy, NeighborhoodPolicy
from mabwiser.simulator import Simulator

In [61]:
######################################################################################
#
# MABWiser
# Scenario: Hyper-Parameter Tuning using the built-in Simulator capability
#
######################################################################################

In [62]:
# Data
size = 1000
decisions = [random.randint(0, 2) for _ in range(size)]
rewards = [random.randint(0, 1000) for _ in range(size)]
contexts = [[random.random() for _ in range(50)] for _ in range(size)]

In [63]:
# Bandits to simulate
n_jobs = 2
hyper_parameter_tuning = []
for radius in range(6, 10):
    hyper_parameter_tuning.append(('Radius'+str(radius),
                                  MAB([0, 1], LearningPolicy.UCB1(1),
                                      NeighborhoodPolicy.Radius(radius),
                                      n_jobs=n_jobs)))

In [64]:
# Simulator with given bandits and data
# The parameters uses standard scaler,
# Test split size set to 0.5
# The split is not order dependent, i.e., random split
# Online training with batch size 10, i.e., bandits will re-train at each batch
# Offline training can be run with batch_size 0, i.e., no re-training during test phase
sim = Simulator(hyper_parameter_tuning, decisions, rewards, contexts,
                scaler=StandardScaler(), test_size=0.5, is_ordered=False, batch_size=10, seed=123456)

2025-08-06 03:45:08,758 INFO Simulation Parameters
2025-08-06 03:45:08,760 INFO 	 bandits: [('Radius6', <mabwiser.mab.MAB object at 0x000001F0DA737A90>), ('Radius7', <mabwiser.mab.MAB object at 0x000001F0DB0DA310>), ('Radius8', <mabwiser.mab.MAB object at 0x000001F0DB32B510>), ('Radius9', <mabwiser.mab.MAB object at 0x000001F0DB32B850>)]
2025-08-06 03:45:08,763 INFO 	 scaler: StandardScaler()
2025-08-06 03:45:08,764 INFO 	 test_size: 0.5
2025-08-06 03:45:08,765 INFO 	 is_ordered: False
2025-08-06 03:45:08,766 INFO 	 batch_size: 10
2025-08-06 03:45:08,766 INFO 	 evaluator: <function default_evaluator at 0x000001F0D333D8A0>
2025-08-06 03:45:08,767 INFO 	 seed: 123456
2025-08-06 03:45:08,767 INFO 	 is_quick: False
2025-08-06 03:45:08,768 INFO 	 log_file: None
2025-08-06 03:45:08,769 INFO 	 format: %(asctime)s %(levelname)s %(message)s


In [65]:
# Run the simulator
sim.run()

2025-08-06 03:45:08,776 INFO 

2025-08-06 03:45:08,779 INFO Total Stats
2025-08-06 03:45:08,781 INFO {0: {'count': 314, 'sum': 152959, 'min': 2, 'max': 999, 'mean': 487.13057324840764, 'std': 279.18246006501136}, 1: {'count': 359, 'sum': 178115, 'min': 1, 'max': 999, 'mean': 496.142061281337, 'std': 275.7264285193785}}
2025-08-06 03:45:08,782 INFO 

2025-08-06 03:45:08,784 INFO Train/Test Split
2025-08-06 03:45:08,787 INFO Train size: 500
2025-08-06 03:45:08,787 INFO Test size: 500
2025-08-06 03:45:08,788 INFO 

2025-08-06 03:45:08,789 INFO Train/Test Scale
2025-08-06 03:45:08,792 INFO 

2025-08-06 03:45:08,794 INFO Train Stats
2025-08-06 03:45:08,794 INFO {0: {'count': 159, 'sum': 81582, 'min': 2, 'max': 999, 'mean': 513.0943396226415, 'std': 267.99839932998077}, 1: {'count': 172, 'sum': 85553, 'min': 1, 'max': 999, 'mean': 497.4011627906977, 'std': 270.8264624983046}}
2025-08-06 03:45:08,796 INFO 

2025-08-06 03:45:08,798 INFO Test Stats
2025-08-06 03:45:08,799 INFO {0: {'count': 155

In [66]:
# Save the results with a prefix
sim.save_results("my_results_")

AttributeError: 'Simulator' object has no attribute 'save_results'

In [None]:
# You can probe the fields of the simulator for other statisics
for mab_name, mab in sim.bandits:
    print(mab_name + "\n")

    # Since the simulation is online, print the 'total' stats
    print('Worst Case Scenario:', sim.bandit_to_arm_to_stats_min[mab_name]['total'])
    print('Average Case Scenario:', sim.bandit_to_arm_to_stats_avg[mab_name]['total'])
    print('Best Case Scenario:', sim.bandit_to_arm_to_stats_max[mab_name]['total'], "\n\n")

In [None]:
# Plot the average case results per every arm for each bandit
sim.plot(metric='avg', is_per_arm=True)