In [2]:
import numpy as np
import matplotlib.pyplot as plt
import copy
import seaborn as sns
import time

In [76]:
class BernoulliBandit(object):
    """Multi-armed bandit whose arms pay a reward of 1 or 0.

    Attributes:
        k: number of arms.
        reward_probabilities: per-arm probability of paying a reward of 1.
        best_probability: the largest value in ``reward_probabilities``.
    """

    def __init__(self, k=5, probas=None, seed=None):
        """Create the bandit.

        Args:
            k: number of arms to generate when ``probas`` is not given.
            probas: optional sequence (list, tuple, NumPy array, ...) of
                per-arm reward probabilities. When given it is stored as-is
                and ``k`` is set to its length, so the two can never disagree.
            seed: optional seed for NumPy's global random state. When omitted
                the global state is left untouched, so a seed set by the
                caller (e.g. in a notebook config cell) stays in effect.

        Raises:
            ValueError: if the bandit would end up with no arms.
        """
        # Only touch the global RNG when explicitly asked to; reseeding from
        # the clock on every construction would silently discard a caller's
        # seed and make bandits created within the same second identical.
        if seed is not None:
            np.random.seed(seed)

        # `is None` rather than `== None`: the latter is evaluated element-wise
        # for NumPy arrays and cannot be used as an `if` condition.
        if probas is None:
            self.k = k
            # Probabilities are drawn from [0.1, 0.9) and rounded to two
            # decimals; float() keeps them plain Python floats for printing.
            self.reward_probabilities = [
                round(float(np.random.uniform(low=0.1, high=0.9)), 2)
                for _ in range(k)
            ]
        else:
            self.reward_probabilities = probas
            # The solvers size their loops from `k`, so derive it from the data.
            self.k = len(probas)

        if len(self.reward_probabilities) == 0:
            raise ValueError("a bandit needs at least one arm")

        self.best_probability = max(self.reward_probabilities)

    def print_bandit(self):
        """Print the reward probabilities and the best one."""
        print("Reward Probabilities: {}\nBest Probability: {}"
              .format(self.reward_probabilities, self.best_probability))

    def get_reward(self, i):
        """Pull arm ``i`` once and return the reward (1 or 0)."""
        # np.random.random() is uniform on [0, 1); a strict `<` makes an arm
        # with probability 0 never pay and an arm with probability 1 always pay.
        if np.random.random() < self.reward_probabilities[i]:
            return 1
        return 0

In [77]:
class EpsilonGreedy(object):
    """Epsilon-greedy solver for a BernoulliBandit.

    On each step the solver usually plays the arm with the highest current
    value estimate, and with probability of about ``epsilon`` plays an arm
    chosen uniformly at random instead.

    Attributes:
        bandit: private deep copy of the bandit being solved.
        epsilon: exploration probability.
        action_val: float array with the running mean reward of each arm.
        action_count: int array with the number of times each arm was played.
        t: number of steps taken so far.
    """

    def __init__(self, Bandit: "BernoulliBandit", eps=1e-4):
        """Set up the solver.

        Args:
            Bandit: the BernoulliBandit to solve; it is deep-copied, so the
                caller's object is never modified.
            eps: exploration probability.
        """
        assert isinstance(Bandit, BernoulliBandit)
        self.bandit = copy.deepcopy(Bandit)
        self.epsilon = eps
        # Size the arrays from the probabilities themselves and fix the dtypes
        # explicitly: value estimates must be floats even if the probabilities
        # were given as integers, and `np.int` no longer exists in NumPy.
        n_arms = len(self.bandit.reward_probabilities)
        self.action_val = np.zeros(n_arms, dtype=float)
        self.action_count = np.zeros(n_arms, dtype=int)
        self.t = 0

    def run_one_step(self):
        """Choose an arm, pull it once and update that arm's statistics."""
        self.t += 1

        # Exploit the best-looking arm unless the draw falls within epsilon.
        if np.random.random() > self.epsilon:
            a_t = int(np.argmax(self.action_val))
        else:
            # Explore over exactly the arms we keep estimates for.
            a_t = int(np.random.randint(low=0, high=self.action_val.size))

        r = self.bandit.get_reward(a_t)

        # Incremental running mean: new = old + (r - old) / n.
        self.action_count[a_t] += 1
        self.action_val[a_t] += (r - self.action_val[a_t]) / self.action_count[a_t]

In [78]:
class UCB(object):
    """Upper-confidence-bound solver for a BernoulliBandit.

    On step ``t`` every arm ``x`` is scored as

        action_val[x] + sqrt(2 * ln(t) / (action_count[x] + 1))

    and the highest-scoring arm is played (the first one on ties). The ``+ 1``
    in the denominator keeps the bonus finite for arms that were never played.

    Attributes:
        bandit: private deep copy of the bandit being solved.
        action_val: float array with the running mean reward of each arm.
        action_count: int array with the number of times each arm was played.
        t: number of steps taken so far.
    """

    def __init__(self, Bandit):
        """Set up the solver.

        Args:
            Bandit: the BernoulliBandit to solve; it is deep-copied, so the
                caller's object is never modified.
        """
        assert isinstance(Bandit, BernoulliBandit)
        self.bandit = copy.deepcopy(Bandit)
        # Size the arrays from the probabilities themselves and fix the dtypes
        # explicitly: value estimates must be floats even if the probabilities
        # were given as integers, and `np.int` no longer exists in NumPy.
        n_arms = len(self.bandit.reward_probabilities)
        self.action_val = np.zeros(n_arms, dtype=float)
        self.action_count = np.zeros(n_arms, dtype=int)
        self.t = 0

    def run_one_step(self):
        """Choose the arm with the highest upper bound, pull it and update."""
        self.t += 1

        # Score all arms at once; np.argmax returns the first maximum, which is
        # the same tie-breaking as max(range(k), key=...).
        exploration_bonus = np.sqrt(2.0 * np.log(self.t) / (self.action_count + 1))
        a_t = int(np.argmax(self.action_val + exploration_bonus))

        r = self.bandit.get_reward(a_t)

        # Incremental running mean: new = old + (r - old) / n.
        self.action_count[a_t] += 1
        self.action_val[a_t] += (r - self.action_val[a_t]) / self.action_count[a_t]

In [79]:
# Create a bandit with the default arguments (no probabilities supplied, so
# they are drawn at random), print them, and build a UCB solver for it.
Bandit = BernoulliBandit()
Bandit.print_bandit()
ucb = UCB(Bandit)

Reward Probabilities: [0.78, 0.8, 0.24, 0.2, 0.75]
Best Probability: 0.8


In [80]:
# Run the UCB solver for 100000 steps. `steps` is used as a countdown and is 0
# once the loop finishes. The solver keeps its state between runs of this cell,
# so executing the cell again adds another 100000 steps to the same `ucb`.
steps = 100000
while steps>0:
    steps-=1
    ucb.run_one_step()
    
# Report the per-arm value estimates, how often each arm was played, and the
# solver's own step counter.
print("Estimates: {}".format(ucb.action_val))
print("Action Count: {}".format(ucb.action_count))
print("Steps: {}".format(ucb.t))

Estimates: [0.78190389 0.80105496 0.20967742 0.10869565 0.75472358]
Action Count: [17396 76780    62    46  5716]
Steps: 100000
