In [1]:
import pandas as pd
import numpy as np
import os
import random

In [2]:
class Bandit:
    total_number = 0
    total_sum = 0
    def __init__(self, name, next_value, *args):
        self.name = name
        self.next_value = next_value
        self.args = args
        self.avg_reward = 0
        self.number_actions = 0
        
        
    def action(self):
        result = self.next_value(*self.args)
        self.number_actions += 1
        self.avg_reward = (self.avg_reward * (self.number_actions - 1) + result)/self.number_actions
        Bandit.total_number += 1
        Bandit.total_sum += result 

In [3]:
class BayesianBandit():
    total_number = 0
    total_sum = 0
    def __init__(self, name, next_value, *args):
        self.name = name
        self.next_value = next_value
        self.args = args
        self.number_actions = 0
        self.a = 1
        self.b = 1
        
    def action(self):
        result = self.next_value(*self.args)
        self.number_actions += 1
        self.a += result
        self.b += 1 - result
        BayesianBandit.total_number += 1
        BayesianBandit.total_sum += result 

In [4]:
def generator(n, p):
    return [1 * (np.random.rand() > p) for i in range(0, n)]

In [5]:
all_actions = [generator(1000, (i + 1)/10) for i in range(0, 10)]
bandits = []
i = 0
for bandit in all_actions:
    i += 1
    bandits.append(Bandit('bandit' + str(i), lambda x: next(x), iter(bandit)))

Epsilon greedy

In [6]:
actions = 0
while actions < 1000:
    try:
        if random.random() > 0.1:
            current_bandit = max(bandits, key=lambda item: item.avg_reward)
        else:
            current_bandit = random.choice(bandits)
        current_bandit.action()
        actions += 1
    except StopIteration:
        break

print('Finish!')
for bandit in bandits:
    print('Bandit {} was launched {} times'.format(bandit.name, bandit.number_actions))
print('Total reward = {}'.format(Bandit.total_sum))
print('Average reward = {}'.format(Bandit.total_sum/Bandit.total_number))

Finish!
Bandit bandit1 was launched 892 times
Bandit bandit2 was launched 10 times
Bandit bandit3 was launched 15 times
Bandit bandit4 was launched 13 times
Bandit bandit5 was launched 7 times
Bandit bandit6 was launched 12 times
Bandit bandit7 was launched 9 times
Bandit bandit8 was launched 14 times
Bandit bandit9 was launched 15 times
Bandit bandit10 was launched 13 times
Total reward = 856
Average reward = 0.856


Bayesian bandit

In [7]:
bandits = []
i = 0
for bandit in all_actions:
    i += 1
    bandits.append(BayesianBandit('bandit' + str(i), lambda x: next(x), iter(bandit)))

In [8]:
actions = 0
while actions < 1000:
    try:
        current_bandit = max(bandits, key=lambda item: np.random.beta(item.a, item.b))
        current_bandit.action()
        actions += 1
    except StopIteration:
        break
        
print('Finish!')
for bandit in bandits:
    print('Bandit {} was launched {} times'.format(bandit.name, bandit.number_actions))
print('Total reward = {}'.format(BayesianBandit.total_sum))
print('Average reward = {}'.format(BayesianBandit.total_sum/BayesianBandit.total_number))

Finish!
Bandit bandit1 was launched 880 times
Bandit bandit2 was launched 79 times
Bandit bandit3 was launched 16 times
Bandit bandit4 was launched 4 times
Bandit bandit5 was launched 9 times
Bandit bandit6 was launched 2 times
Bandit bandit7 was launched 2 times
Bandit bandit8 was launched 3 times
Bandit bandit9 was launched 3 times
Bandit bandit10 was launched 2 times
Total reward = 884
Average reward = 0.884


And of course UCB1 Bandit

In [9]:
Bandit.total_sum = 0
Bandit.total_number = 0
bandits = []
i = 0
for bandit in all_actions:
    i += 1
    bandits.append(Bandit('bandit' + str(i), lambda x: next(x), iter(bandit)))
    bandits[-1].action()

In [10]:
actions = 0
while actions < 1000:
    try:
        current_bandit = max(bandits, key=lambda item: item.avg_reward + np.sqrt(2 * np.log(Bandit.total_number)/item.number_actions))
        current_bandit.action()
        actions += 1
    except StopIteration:
        break
        
print('Finish!')
for bandit in bandits:
    print('Bandit {} was launched {} times'.format(bandit.name, bandit.number_actions))
print('Total reward = {}'.format(Bandit.total_sum))
print('Average reward = {}'.format(Bandit.total_sum/Bandit.total_number))

Finish!
Bandit bandit1 was launched 475 times
Bandit bandit2 was launched 144 times
Bandit bandit3 was launched 188 times
Bandit bandit4 was launched 46 times
Bandit bandit5 was launched 45 times
Bandit bandit6 was launched 33 times
Bandit bandit7 was launched 29 times
Bandit bandit8 was launched 22 times
Bandit bandit9 was launched 16 times
Bandit bandit10 was launched 12 times
Total reward = 776
Average reward = 0.7683168316831683


#### Conclusion:
Best algorihm is Bayesian algorihm, because it gives us the best outcome (884).
<dr>
Second is Epsilon greedy (856).
<dr>
And the last plaсe goes to UCB1 algorihm (776).