In [1]:
import numpy as np
import pandas as pd

np.random.seed(0)

## Multi Armed Bandit

In [2]:
class MAB:
    
    def __init__(self, n_levers=5):
        self.n_levers = 5
        
    def assign_threshold(self, good_levers=[0,1], medium_levers=[2], bad_levers=[3,4]):
        """
        Simple rules:
        good levers => 70% of winning
        medium levers => 50% of winning
        bad levers => 20% of winning
        
        reward : 0 or 1
        """
        all_levers = good_levers+medium_levers+bad_levers
        all_levers = sorted(all_levers)
        if all_levers != list(range(self.n_levers)):
            raise ValueError("Please enter the proper lever numbers...")
       
        good_threshold = 0.7
        medium_threshold = 0.5
        bad_threshold = 0.2
        
        self.lever_threshold = {}
        for lever in good_levers:
            self.lever_threshold[lever] = good_threshold
        for lever in medium_levers:
            self.lever_threshold[lever] = medium_threshold
        for lever in bad_levers:
            self.lever_threshold[lever] = bad_threshold
            
    def play(self, lever):
        if lever not in range(self.n_levers):
            raise ValueError("Please select the valid lever number...")
        random_n = np.random.random()
        reward = int(self.lever_threshold[lever] >= np.random.random())
        return reward

In [3]:
mab = MAB(n_levers=5)
mab.assign_threshold(good_levers=[0,1], medium_levers=[2], bad_levers=[3,4])

In [4]:
# check mab
total_rewards = {}
count = {}
average_rewards = {}
for i in range(5):
    total_rewards[i] = 0
    count[i] = 0
    
for _ in range(1000):
    action = np.random.choice(5)
    reward = mab.play(lever=action)
    total_rewards[action] += reward
    count[action] += 1
    
for i in range(5):
    average_rewards[i] = total_rewards[i] / count[i]
    
print(average_rewards) 

{0: 0.6650717703349283, 1: 0.6954314720812182, 2: 0.5025906735751295, 3: 0.21226415094339623, 4: 0.19047619047619047}


## Agent

In [5]:
class Agent:
    
    def __init__(self, mab, init_n=5, epsilon=0.1):
        """
        mab -> Multi armed bandit
        init_n -> number of plays for each action for its Q value
        Q -> action-value function
        N -> number of time action was taken
        """
        self.mab = mab
        self.epsilon = epsilon
        self.Q = {}
        self.N = {}
        self.total_reward = 0
        self.t = mab.n_levers * init_n # for ucb
        self.ucb = {} # for ucb
        self.c = 0.1 # for ucb 
        for i in range(mab.n_levers):
            self.N[i] = init_n
            self.Q[i] = 0
            for _ in range(init_n):
                reward = mab.play(lever=i)
                self.Q[i] += reward
                self.total_reward += reward
            self.Q[i] /= init_n
            self.ucb[i] = self.Q[i] + self.c * np.sqrt(np.log(self.t)/self.N[i])
            
    def get_random_action(self):
        return np.random.choice(self.mab.n_levers)
    
    def get_greedy_action(self):
        return max(self.Q, key = lambda action : self.Q[action])
    
    def select_action_by_epsilon_greedy(self):
        random_n = np.random.random()
        if random_n < self.epsilon:
            action = self.get_random_action()
        else:
            action = self.get_greedy_action()
        self.N[action] += 1
        return action
    
    def select_action_by_upper_confidence_bound(self):
        
        action = max(self.ucb, key = lambda action : self.ucb[action])
        self.t += 1
        self.N[action] += 1
        return action
    
    def train(self, t=1000, method="epsilon_greedy"):
        for _ in range(t):
            if method == "epsilon_greedy":
                action = self.select_action_by_epsilon_greedy()
            elif method == "ucb":
                action = self.select_action_by_upper_confidence_bound()
            else:
                raise ValueError("Please select the valid action method")
            reward = mab.play(lever=action)
            alpha = 1 / self.N[action]
            self.Q[action] += alpha * (reward - self.Q[action])
            if method == "ucb":
                self.ucb[action] = self.Q[action] + self.c * np.sqrt(np.log(self.t)/self.N[i])
            self.total_reward += reward
        return self.total_reward

## UCB

In [6]:
agent = Agent(mab, init_n=5, epsilon=0)
total_reward = agent.train(t=10**7, method="ucb")
print(total_reward)
print(agent.Q)

7000679
{0: 0.7000669599865862, 1: 0.5, 2: 0.2, 3: 0.2, 4: 0.4}


## Epsilon Greedy

In [8]:
agent = Agent(mab, init_n=5, epsilon=0)
total_reward = agent.train(t=10**6)
print(total_reward)
print(agent.Q)

700525
{0: 0.7005142994856899, 1: 0.4, 2: 0.5555555555555556, 3: 0.2, 4: 0.4}


In [7]:
agent = Agent(mab, init_n=5, epsilon=0.001)
total_reward = agent.train(t=10**6)
print(total_reward)
print(agent.Q)

701088
{0: 0.7012986493086394, 1: 0.6619047619047616, 2: 0.5323383084577115, 3: 0.2918660287081338, 4: 0.23004694835680756}


In [9]:
agent = Agent(mab, init_n=5, epsilon=0.01)
total_reward = agent.train(t=10**6)
print(total_reward)
print(agent.Q)

696410
{0: 0.6988195298722967, 1: 0.691694523013628, 2: 0.5037821482602125, 3: 0.19290354822588707, 4: 0.20804710500490642}


In [10]:
agent = Agent(mab, init_n=5, epsilon=0.25)
total_reward = agent.train(t=10**6)
print(total_reward)
print(agent.Q)

639195
{0: 0.6947121175620358, 1: 0.6994309699744669, 2: 0.49591873752947585, 3: 0.19952014395681264, 4: 0.20203209837112562}
