In [1]:
## Multi-Armed Bandit
## Explore-Exploit

#https://www.analyticsvidhya.com/blog/2018/09/reinforcement-multi-armed-bandit-scratch-python/

In [2]:
#https://github.com/Heewon-Hailey/multi-armed-bandits-for-recommendation-systems/blob/main/MABs.ipynb
#https://github.com/fidelity/mab2rec/blob/main/requirements.txt
#https://github.com/fidelity/mab2rec/tree/main/data - for learning purpose

In [3]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [4]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from pandas.core.arrays.period import delta_to_tick
import math

In [5]:
path = "/content/drive/MyDrive/Colab Notebooks/ds_projects/data/ads_optimisation.csv"
dataset = pd.read_csv(path)

In [6]:
dataset.head(3)

Unnamed: 0,Ad 1,Ad 2,Ad 3,Ad 4,Ad 5,Ad 6,Ad 7,Ad 8,Ad 9,Ad 10
0,1,0,0,0,1,0,0,0,1,0
1,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0


In [7]:
dataset.describe()

Unnamed: 0,Ad 1,Ad 2,Ad 3,Ad 4,Ad 5,Ad 6,Ad 7,Ad 8,Ad 9,Ad 10
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,0.1703,0.1295,0.0728,0.1196,0.2695,0.0126,0.1112,0.2091,0.0952,0.0489
std,0.375915,0.335769,0.259821,0.324509,0.443722,0.111546,0.314395,0.406686,0.293506,0.21567
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Random Selection Algorithm -> Randomly show the ad to user and check for reward

In [8]:
import random

In [9]:
N = dataset.shape[0]
d = 10
selected_ads = []
total_reward = 0

dataset_values = dataset.values
for n in range(0, N):
  ad = random.randrange(d)
  selected_ads.append(ad)
  reward = dataset_values[n, ad]
  total_reward += reward

In [10]:
total_reward

1247

In [11]:
pd.Series(selected_ads).tail(1000).value_counts(normalize=True)

4    0.115
2    0.112
7    0.109
8    0.102
0    0.100
9    0.098
3    0.095
1    0.091
6    0.089
5    0.089
dtype: float64

In [12]:
pd.Series(selected_ads).head(1000).value_counts(normalize=True)

4    0.121
7    0.112
9    0.104
5    0.103
3    0.101
0    0.099
1    0.095
2    0.090
8    0.088
6    0.087
dtype: float64

## Upper Confidence Bound Algorithm - UCB
1. Step 1 assign 0 to all d
2. now get rewards for all ads by iterating data
3. in the end you will have more rewards for the ad which has more reward in the data

In [13]:
from pandas.core.arrays.period import delta_to_tick
import math
N = dataset.shape[0]
d = 10
selected_ads_ucb = []
numbers_of_selections = [0] * d

rewards_across_d = [0] * d
total_reward_ucb = 0
dataset_values = dataset.values

for n in range(0, N):
  ad = 0
  maxium_upper_bound = 0
  for i in range(0, d):
    if numbers_of_selections[i] > 0:
      ## this runs 1 for all ads
      average_reward_d = rewards_across_d[i] / numbers_of_selections[i]
      delta_d = math.sqrt(2 * math.log(n+1) / numbers_of_selections[i])
      upper_bound = average_reward_d + delta_d
    else:
      # this happens for all selections atleast once and then it will not come here
      upper_bound = 1e400
    if upper_bound > maxium_upper_bound:
      # print(f"upper_bound: {upper_bound}, maxim_upper_bound: {maxium_upper_bound}")
      maxium_upper_bound = upper_bound
      ad = i
      # print(f"i: {i}")

  selected_ads_ucb.append(ad)
  numbers_of_selections[ad] += 1
  reward = dataset_values[n, ad]
  rewards_across_d[ad] += reward
  total_reward += reward

  # print("----")
  # print(f"numbers_of_selection: {numbers_of_selections}")
  # print(upper_bound)
  # print(f"ad: {ad}")
  # print(f"rewards_across_d: {rewards_across_d}")
  # if n == 20:
  #   break

In [14]:
numbers_of_selections

[947, 417, 338, 380, 5630, 180, 435, 1106, 352, 215]

In [15]:
rewards_across_d

[176, 48, 31, 40, 1519, 1, 52, 217, 34, 7]

In [16]:
dataset.describe()

Unnamed: 0,Ad 1,Ad 2,Ad 3,Ad 4,Ad 5,Ad 6,Ad 7,Ad 8,Ad 9,Ad 10
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,0.1703,0.1295,0.0728,0.1196,0.2695,0.0126,0.1112,0.2091,0.0952,0.0489
std,0.375915,0.335769,0.259821,0.324509,0.443722,0.111546,0.314395,0.406686,0.293506,0.21567
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [17]:
total_reward

3372

In [18]:
pd.Series(selected_ads_ucb).head(1000).value_counts(normalize=True)

7    0.192
4    0.153
0    0.115
8    0.096
3    0.093
1    0.080
6    0.080
2    0.077
5    0.059
9    0.055
dtype: float64

In [19]:
pd.Series(selected_ads_ucb).tail(1000).value_counts(normalize=True)

4    0.771
0    0.106
7    0.034
3    0.034
2    0.026
1    0.007
6    0.007
8    0.006
9    0.005
5    0.004
dtype: float64

Implementation of Multi-Armed Bandit for Recommender Systems
1. This does not have typical RL components like State. Because Agent is not interacting with the system in sequential steps manner and context of the environment is already encoded in the features (context_features+user_features).

Example Multi-Armed Bandit

In [22]:
train_data_path = "/content/drive/MyDrive/Colab Notebooks/ds_projects/data/rl/data_train.csv"
test_data_path = "/content/drive/MyDrive/Colab Notebooks/ds_projects/data/rl/data_test.csv"
user_features_path = "/content/drive/MyDrive/Colab Notebooks/ds_projects/data/rl/features_user.csv"
item_features_path = "/content/drive/MyDrive/Colab Notebooks/ds_projects/data/rl/features_item.csv"

In [23]:
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)
user_features = pd.read_csv(user_features_path)
item_features = pd.read_csv(item_features_path)

In [24]:
train_data.head(3)

Unnamed: 0,user_id,item_id,response
0,843,427,0
1,144,173,1
2,601,250,0


In [25]:
user_features.head(3)

Unnamed: 0,user_id,u0,u1,u2,u3,u4,u5,u6,u7,u8,...,u22,u23,u24,u25,u26,u27,u28,u29,u30,u31
0,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,2,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
2,3,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [26]:
item_features.head(3)

Unnamed: 0,item_id,i0,i1,i2,i3,i4,i5,i6,i7,i8,i9,i10,i11,i12,i13,i14,i15,i16,i17,i18
0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,4,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
2,7,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0


In [27]:
train_data.response.value_counts()

0    25339
1     9204
Name: response, dtype: int64

In [28]:
train_data.user_id.nunique()

896

In [29]:
train_data.item_id.nunique()

201

## Join train with user features
- not required for UCB model

## UCB Agent

In [30]:
class UCBModel:
  def __init__(self, num_arms):
    self.num_arms = num_arms
    self.rewards = [0] * num_arms # for all arams initialize 0 reward
    self.counts = [0] * num_arms
    self.timestep = 0 # total number of iterations (train data size)

  def choose_arm(self):
    if 0 in self.counts:
      return self.counts.index(0)

    ucb_arm_values = [] # compute ucb value for each arm
    for index in range(self.num_arms):
      arm_reward_value = self.rewards[index]
      average_arm_reward_value = arm_reward_value / self.counts[index]
      delta_for_arm = np.sqrt(2 * np.log(self.timestep) / self.counts[index])
      ucb_arm_value = average_arm_reward_value + delta_for_arm
      ucb_arm_values.append(ucb_arm_value)

    return np.argmax(ucb_arm_values) # return the arm with higher arm value

  def update(self, arm, reward):
    self.timestep += 1
    self.rewards[arm] += reward
    self.counts[arm] += 1

In [31]:
num_arms = train_data.item_id.nunique()
banditModel = UCBModel(num_arms)

In [32]:
def train_ucb_model(model, train_data, test_data, epochs=2):
  train_rewards = []
  test_rewards = []

  for epoch in range(epochs):
    print(f"epoch: {epoch}")
    for ind, row in train_data.iterrows():
      arm = model.choose_arm()
      reward = row["response"]
      model.update(arm , reward)

    train_rewards.append(np.mean([train_data[train_data['user_id'] == user_id]['response'].max() for user_id in train_data['user_id'].unique()]))
    test_rewards.append(np.mean([test_data[test_data['user_id'] == user_id]['response'].max() for user_id in test_data['user_id'].unique()]))

  return model, train_rewards, test_rewards

In [33]:
banditModel, train_rewards, test_rewards = train_ucb_model(banditModel,
                                                           train_data, test_data, 4)

epoch: 0
epoch: 1
epoch: 2
epoch: 3


In [34]:
test_rewards

[0.9017857142857143,
 0.9017857142857143,
 0.9017857142857143,
 0.9017857142857143]

In [35]:
banditModel.choose_arm()

31

In [36]:
banditModel.choose_arm()

31

In [37]:
banditModel.choose_arm()

31

## UCB can be used only for Popular Items since it does not change for different customers


# Epsilon Greedy
Similar to UCB but also choose random arm

In [38]:
class EpsilonGreedyUCBModel:
  def __init__(self, num_arms):
    self.num_arms = num_arms
    self.rewards = [0] * num_arms # for all arams initialize 0 reward
    self.counts = [0] * num_arms
    self.timestep = 0 # total number of iterations (train data size)
    self.epsilon = 0.1

  def choose_arm(self):
    # Whenever random generator number is below 0.1 it will choose an arm randomly
    if random.random() < self.epsilon:
      return random.randint(0, self.num_arms - 1)
    else:
      if 0 in self.counts:
        return self.counts.index(0)

      ucb_arm_values = [] # compute ucb value for each arm
      for index in range(self.num_arms):
        arm_reward_value = self.rewards[index]
        average_arm_reward_value = arm_reward_value / self.counts[index]
        delta_for_arm = np.sqrt(2 * np.log(self.timestep) / self.counts[index])
        ucb_arm_value = average_arm_reward_value + delta_for_arm
        ucb_arm_values.append(ucb_arm_value)

      return np.argmax(ucb_arm_values) # return the arm with higher arm value

  def update(self, arm, reward):
    self.timestep += 1
    self.rewards[arm] += reward
    self.counts[arm] += 1

In [39]:
epsilonGreedybanditModel = EpsilonGreedyUCBModel(num_arms)

epsilonGreedybanditModel, train_rewards, test_rewards = train_ucb_model(epsilonGreedybanditModel,
              train_data, test_data, 2)

epoch: 0
epoch: 1


In [40]:
epsilonGreedybanditModel.epsilon

0.1

In [41]:
for i in range(10):
  print(epsilonGreedybanditModel.choose_arm())

183
183
183
183
183
183
183
183
183
183


In [42]:
for i in range(10):
  print(epsilonGreedybanditModel.choose_arm())

183
183
183
183
183
183
86
183
183
183


In [43]:
epsilonGreedybanditModel.epsilon = 0.7

In [44]:
for i in range(10):
  print(epsilonGreedybanditModel.choose_arm())

196
167
98
190
42
48
128
45
179
183


Using Epsilon Greedy - Randomization can be added to any bandit Algo
* Higher epsilon - higher probability of choosing random arm apart from popular arm

# LinearUCB

##### Contextual bandit algorithm -> Meaning, can be used for personalization, each user will have different recommendations

In [45]:
train_data

Unnamed: 0,user_id,item_id,response
0,843,427,0
1,144,173,1
2,601,250,0
3,751,751,0
4,201,275,0
...,...,...,...
34538,883,13,0
34539,852,597,0
34540,339,133,0
34541,493,318,1


In [46]:
train_data_with_features = train_data.merge(user_features, left_on="user_id", right_on="user_id")
test_data_with_features = test_data.merge(user_features, left_on="user_id", right_on="user_id")

In [47]:
train_data_with_features.head(2)

Unnamed: 0,user_id,item_id,response,u0,u1,u2,u3,u4,u5,u6,...,u22,u23,u24,u25,u26,u27,u28,u29,u30,u31
0,843,427,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,843,252,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
class LinearUCBModel:
  def __init__(self, num_arms, num_features, alpha=1.0):
    self.num_arms = num_arms
    self.num_features = num_features
    self.alpha = alpha

    self.A = [np.identity(num_features) for _ in range(num_arms)]
    self.b = [np.zeros((num_features, 1)) for _ in range(num_arms)]
    self.theta = [np.zeros((num_features, 1)) for _ in range(num_arms)]

  def choose_arm(self, user_features):
    p_values = []
    for arm in range(self.num_arms):
      p_values.append(self.predict_reward(user_features, arm))
    return np.argmax(p_values)

  def predict_reward(self, user_features, arm):
    A_inverse = np.linalg.inv(self.A[arm])
    theta_arm = self.theta[arm]
    x = user_features.reshape((-1, 1))
    mean_reward = theta_arm.T.dot(x) # dot product between user and item
    uncertainity = self.alpha * np.sqrt(x.T.dot(A_inverse).dot(x))

    return mean_reward + uncertainity

  def update(self, user_features, arm, reward):
    x = user_features.reshape((-1, 1))
    self.A[arm] += x.dot(x.T)
    self.b[arm] += reward * x
    self.theta[arm] = np.linalg.inv(self.A[arm]).dot(self.b[arm])

In [49]:
num_features = len(train_data_with_features.columns[3:])
features = train_data_with_features.columns[3:]
model = LinearUCBModel(num_arms, num_features)

In [50]:
train_rewards = []
test_rewards = []
epochs = 1

for epoch in range(epochs):
  print(f"epoch: {epoch}")
  for ind, row in train_data_with_features.iterrows():
    if ind % 200 == 0:
      print(ind)
    user_features = np.array(row[features])
    arm = model.choose_arm(user_features)
    reward = row["response"]
    model.update(user_features, arm, reward)

  train_rewards.append(np.mean([train_data[train_data['user_id'] == user_id]['response'].max() for user_id in train_data['user_id'].unique()]))
  test_rewards.append(np.mean([test_data[test_data['user_id'] == user_id]['response'].max() for user_id in test_data['user_id'].unique()]))


epoch: 0
0
200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
2600
2800
3000
3200
3400
3600
3800
4000
4200
4400
4600
4800
5000
5200
5400
5600
5800
6000
6200
6400
6600
6800
7000
7200
7400
7600
7800
8000
8200
8400
8600
8800
9000
9200
9400
9600
9800
10000
10200
10400
10600
10800
11000
11200
11400
11600
11800
12000
12200
12400
12600
12800
13000
13200
13400
13600
13800
14000
14200
14400
14600
14800
15000
15200
15400
15600
15800
16000
16200
16400
16600
16800
17000
17200
17400
17600
17800
18000
18200
18400
18600
18800
19000
19200
19400
19600
19800
20000
20200
20400
20600
20800
21000
21200
21400
21600
21800
22000
22200
22400
22600
22800
23000
23200
23400
23600
23800
24000
24200
24400
24600
24800
25000
25200
25400
25600
25800
26000
26200
26400
26600
26800
27000
27200
27400
27600
27800
28000
28200
28400
28600
28800
29000
29200
29400
29600
29800
30000
30200
30400
30600
30800
31000
31200
31400
31600
31800
32000
32200
32400
32600
32800
33000
33200
33400
33600
33800
34000
34200
34400


In [52]:
model.choose_arm(np.array(train_data_with_features.iloc[3][features]))

197

In [59]:
model.choose_arm(np.array(train_data_with_features.iloc[34538][features]))

179

In [60]:
model.choose_arm(np.array(train_data_with_features.iloc[34542][features]))

49

## Thompson Sampling Implementation
1. This is not based on context
2. "Thompson Sampling with User Context" model will provide Personalization

* Idea: Will sample from beta distribution and parameters of beta distribution for each arm will be obtained through training process.
* For any given point, for all arms you sample from the learnt beta distribution and you chose the arm which has high samples.
* alpha and beta values are learnt through rewards. Meaning if an arm is succesful it will have more alpha compared to less succesful arm. If an arm is successful, it will samples more values closer towards 1.0 compared to less succesful arm (sampling will be closer to 0)
* alpha and beta parameters mainly represent success probability for each arm

In [92]:
## Beta distribution understanding

# if alpha > beta -> higher density towards 1.0
# if beta > alpha -> higher density towards 0.0

# It means if an arm has more positive (alpha) compared to negative feedback (beta)
# the sampleing from beta will be close to 1.0

In [88]:
class ThompsonSamplingModel:
  def __init__(self, num_arms):
    self.num_arms = num_arms
    self.alpha = np.ones(num_arms) # this will hold positive feedback for an arm
    self.beta = np.ones(num_arms) # this will hold negative feedback for an arm

  def choose_arm(self):
    ## for each arm you sample from the beta distribution -> each arm will have
      # its own beta distribution parameters
    samples_for_arms = []
    for arm in range(self.num_arms):
      sample_for_arm = np.random.beta(self.alpha[arm], self.beta[arm])
      samples_for_arms.append(sample_for_arm)
    return np.argmax(samples_for_arms)

  def update(self, arm, reward):
    self.alpha[arm] += reward
    self.beta[arm] += (1 - reward)

In [89]:
num_arms = train_data.item_id.nunique()
banditThompsonSampled = ThompsonSamplingModel(num_arms)

In [90]:
epochs = 1
for epoch in range(epochs):
  print(f"epoch: {epoch}")
  for ind, row in train_data.iterrows():
    if ind % 5000 == 0:
      print(ind)
    arm = banditThompsonSampled.choose_arm()
    reward = row["response"]
    banditThompsonSampled.update(arm , reward)

epoch: 0
0
5000
10000
15000
20000
25000
30000


In [93]:
print(banditThompsonSampled.choose_arm())

30


In [94]:
print(banditThompsonSampled.choose_arm())

128


In [95]:
print(banditThompsonSampled.choose_arm())

144


In [96]:
print(banditThompsonSampled.choose_arm())

10


In [97]:
print(banditThompsonSampled.choose_arm())

47


In [98]:
print(banditThompsonSampled.choose_arm())

83


In [99]:
banditThompsonSampled.alpha[0:4]

array([78., 30., 75., 21.])

In [100]:
banditThompsonSampled.beta[0:4]

array([196.,  95., 200.,  70.])

# Thomposon Sampling with User Context Features

* Extra thing will be -> alpha and beta parameters will be learned "for each arm, feature"

In [119]:
class ThompsonSamplingModelWithUserContext:
  def __init__(self, num_arms, num_features):
    self.num_arms = num_arms
    self.num_features = num_features
    self.alpha = np.ones((num_arms, num_features))
    self.beta = np.ones((num_arms, num_features))

  def choose_arm(self, user_features):
    """
    Example
      1 arm 2 features -> 2 alpha and 2 betas
      beta_dist = a = np.array([0.96029728, 0.90110295])
      user_feat = b = np.array([10, 24])
      similarity_beta_and_user = a @ b = 31.2294436
    """
    samples_for_arms_and_features = []
    for arm in range(self.num_arms):
      sample_for_arm_for_each_feature = np.random.beta(self.alpha[arm], self.beta[arm])
      sample_arm_feature_and_user_feature_similarity = sample_for_arm_for_each_feature @ user_features # dot product
      samples_for_arms_and_features.append(sample_arm_feature_and_user_feature_similarity)
    return np.argmax(samples_for_arms_and_features)

  def update(self, arm, user_features, reward):
    self.alpha[arm] += user_features * reward
    self.beta[arm] += user_features * (1-reward) # if reward is 1, it will be 0

In [112]:
a = np.array([0.96029728, 0.90110295])

In [113]:
b = np.array([10, 24])

In [114]:
a @ b

31.2294436

In [118]:
b * 0

array([0, 0])

In [111]:
np.random.beta([1, 2], [0.5, 1])

array([0.96029728, 0.90110295])

In [120]:
num_features = len(train_data_with_features.columns[3:])
features = train_data_with_features.columns[3:]
thompsonWithContextModel = ThompsonSamplingModelWithUserContext(num_arms, num_features)

In [121]:
epochs = 1

for epoch in range(epochs):
  print(f"epoch: {epoch}")
  for ind, row in train_data_with_features.iterrows():
    if ind % 2000 == 0:
      print(ind)
    user_features = np.array(row[features])
    arm = thompsonWithContextModel.choose_arm(user_features)
    reward = row["response"]
    thompsonWithContextModel.update(user_features, arm, reward)

epoch: 0
0
2000
4000
6000
8000
10000
12000
14000
16000
18000
20000
22000
24000
26000
28000
30000
32000
34000


Different Users

In [122]:
thompsonWithContextModel.choose_arm(np.array(train_data_with_features.iloc[3][features]))

163

In [123]:
thompsonWithContextModel.choose_arm(np.array(train_data_with_features.iloc[3540][features]))

161

Same User but different recommendation

In [125]:
thompsonWithContextModel.choose_arm(np.array(
    train_data_with_features.iloc[34538][features]))

5

In [126]:
thompsonWithContextModel.choose_arm(np.array(
    train_data_with_features.iloc[34538][features]))

80

In [127]:
thompsonWithContextModel.choose_arm(np.array(
    train_data_with_features.iloc[34538][features]))

76

In [128]:
thompsonWithContextModel.choose_arm(np.array(
    train_data_with_features.iloc[34538][features]))

128

In [129]:
thompsonWithContextModel.choose_arm(np.array(
    train_data_with_features.iloc[34538][features]))

148