In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [121]:
class Bandit:
    def __init__(self, win_rate:float):
        self.win_rate = win_rate
        self.pull_count = 0
        self.average = 0
        self.pulls = []
        
    def pull(self) -> bool:
        rand = np.random.rand()
        if rand < self.win_rate:
            result = True
        else:
            result = False
        
        self.pull_count += 1

        if self.pull_count > 1:
            self.average = ((self.average * (self.pull_count-1))/(self.pull_count)) + int(result) / self.pull_count
        else:
            self.average = int(result)
            
        self.pulls.append(int(result))
        
        return result
    
    def sample(self, pull_sum:int=1):
        """
        Draws a sample on estimated parameters
        """
        if self.pull_count > 0:
            pulls = np.array(self.pulls)
            m0 = (np.sum(self.pulls) / (self.pull_count + 1))
            stdev = np.sqrt(np.sum(np.square(pulls - m0)))
            return np.random.normal(loc=m0, scale=(stdev/np.sqrt(self.pull_count) + 1e-7))
        else:
            return np.random.normal()
        

In [29]:
b = Bandit(0.5)

In [30]:
for i in range(100):
    b.pull()
print(b.average)

0.4599999999999999


In [71]:
def pickMachineUCB1(bandits:np.ndarray, n:int) -> Bandit:
    bandit = np.argmax([bandit.average + np.sqrt(2*np.log(n+1e-10) / (bandit.pull_count + 1e-10)) for bandit in bandits])
    return bandits[bandit]
    

def pickMachineEpsilonGreedy(bandits:np.ndarray, epsilon:float) -> Bandit:
    if np.random.rand() < epsilon:
        # by random
        choice = bandits[np.random.randint(0, len(bandits))]
    else:
        # pick best 
        choice = bandits[0]
        for bandit in bandits:
            if bandit.average > choice.average:
                choice = bandit
                
    return choice

def pickMachineThomson(bandits:np.ndarray):
    pulls = np.sum([b.pull_count for b in bandits])
    choice = np.argmax([b.sample(pull_sum=pulls) for b in bandits])
    return bandits[choice]


In [7]:
def epsilon_greedy_search():
    bandits = [Bandit(x) for x in np.arange(0, 1, 0.1)]
    epsilons = np.arange(0, 1, 0.02)
    for epsilon in epsilons:
        for i in range(10000):
            choice = pickMachineEpsilonGreedy(bandits=bandits, epsilon=epsilon)
            choice.pull()

        print("Epsilon: {}".format(epsilon), [x.pull_count for x in bandits])

In [6]:
epsilon_greedy_search()

Epsilon: 0.0 [10000, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Epsilon: 0.02 [10046, 19, 27, 17, 21, 14, 20, 44, 1344, 8448]
Epsilon: 0.04 [10094, 60, 68, 56, 53, 60, 70, 81, 1395, 18063]
Epsilon: 0.06 [10144, 102, 121, 121, 110, 109, 135, 142, 1442, 27574]
Epsilon: 0.08 [10214, 209, 191, 191, 194, 188, 207, 210, 1506, 36890]
Epsilon: 0.1 [10324, 307, 302, 291, 298, 271, 311, 298, 1615, 45983]
Epsilon: 0.12 [10461, 432, 419, 423, 413, 408, 425, 409, 1741, 54869]
Epsilon: 0.14 [10617, 557, 560, 560, 548, 541, 569, 539, 1883, 63626]
Epsilon: 0.16 [10778, 733, 720, 744, 710, 711, 736, 706, 2038, 72124]
Epsilon: 0.18 [10959, 907, 885, 940, 890, 898, 914, 885, 2228, 80494]
Epsilon: 0.2 [11159, 1103, 1079, 1127, 1087, 1098, 1113, 1091, 2439, 88704]
Epsilon: 0.22 [11390, 1329, 1341, 1326, 1315, 1312, 1327, 1312, 2660, 96688]
Epsilon: 0.24 [11622, 1578, 1583, 1575, 1553, 1518, 1581, 1520, 2878, 104592]
Epsilon: 0.26 [11870, 1832, 1836, 1849, 1786, 1756, 1834, 1795, 3118, 112324]
Epsilon: 0.28 [12127, 2083, 2

In [95]:
def UCB1():
    bandits = [Bandit(x) for x in np.arange(0.1, 1, 0.1)]
    for n in range(0, 10):
        bandit = pickMachineUCB1(bandits=bandits, n=n)
        bandit.pull()
        
    print([str(bandit.pull_count)+": {}/{}".format(bandit.average, bandit.win_rate) for bandit in bandits])


In [94]:
UCB1()

['1: 0/0.1', '2: 0.5/0.2', '1: 0/0.30000000000000004', '1: 1/0.4', '1: 1/0.5', '1: 1/0.6', '1: 0/0.7000000000000001', '1: 0/0.8', '1: 1/0.9']


  from ipykernel import kernelapp as app


In [126]:
def thomson():
    bandits = [Bandit(x) for x in np.arange(0.1, 1, 0.1)]
    for n in range(0, 100):
        bandit = pickMachineThomson(bandits=bandits)
        bandit.pull()
        
    print([str(bandit.pull_count)+": {:.2f}/{:.2f} +++ ".format(bandit.average, bandit.win_rate) for bandit in bandits])
 

In [127]:
thomson()

['7: 0.29/0.10 +++ ', '1: 0.00/0.20 +++ ', '13: 0.31/0.30 +++ ', '1: 0.00/0.40 +++ ', '21: 0.48/0.50 +++ ', '10: 0.60/0.60 +++ ', '18: 0.61/0.70 +++ ', '13: 0.77/0.80 +++ ', '16: 1.00/0.90 +++ ']
