### Simulation of the multi-armed bandit problem as described in: [Reinforcement Learning book by Richard Sutton](http://incompleteideas.net/sutton/book/the-book-2nd.html)

In [186]:
import multiprocessing
from pprint import pprint as pp

import numpy as np
from bokeh.charts import Line, output_file, show
from bokeh.charts.attributes import ColorAttr, CatAttr
from bokeh.io import output_notebook
from bokeh.sampledata.autompg import autompg as df
from matplotlib import pyplot as plt


%matplotlib inline
output_notebook()

In [189]:
# Number of independent, randomly generated bandit problems simulated per exploration rate.
N_PROBLEM = 100
# Number of pulls (time steps) simulated for each problem.
N_STEP = 2000
# Number of arms (slot machines) in each bandit problem.
K = 10

In [190]:
def e_greedy(rewards, epsilon=0, steps=N_STEP):
    """Simulate one epsilon-greedy run on a single multi-armed bandit problem.

    Every pull of an arm pays a reward drawn from a normal distribution with
    unit standard deviation centred on that arm's true mean.  Action values
    start at zero and are estimated with the sample average of the rewards
    observed so far.

    Args:
        rewards: Non-empty sequence of true mean rewards, one entry per arm.
        epsilon: Probability in [0, 1] of exploring on a given step.
            Exploration picks uniformly among the arms that are *not*
            currently tied for the highest estimate; when every arm is tied
            the step falls back to a greedy pick.
        steps: Number of pulls to simulate.

    Returns:
        A tuple ``(actual_rewards, optimal_rewards, reward_estimates)``:
        ``actual_rewards`` is the list of rewards received at each step,
        ``optimal_rewards`` is a list of rewards sampled at each step from the
        arm with the highest true mean (a noisy baseline), and
        ``reward_estimates`` is the final per-arm sample-average estimate.

    Raises:
        ValueError: If ``rewards`` is empty or ``epsilon`` is outside [0, 1].
    """
    n_arms = len(rewards)
    if n_arms == 0:
        raise ValueError("rewards must contain at least one arm")
    if not 0 <= epsilon <= 1:
        raise ValueError("epsilon must be a probability in [0, 1]")

    # True mean of the best arm; it never changes during a run, so compute it once.
    best_mean = max(rewards)

    actual_rewards = []
    optimal_rewards = []
    times_sampled = [0] * n_arms
    reward_estimates = [0.0] * n_arms

    for _ in range(steps):
        max_estimate = max(reward_estimates)
        greedy_machines = [i for i, r in enumerate(reward_estimates) if r == max_estimate]
        explore_machines = [i for i, r in enumerate(reward_estimates) if r != max_estimate]

        # Explore with probability epsilon, as long as at least one non-greedy
        # arm exists (a single non-greedy arm is still a valid exploration target).
        if np.random.random() < epsilon and explore_machines:
            machine_chosen = np.random.choice(explore_machines)
        else:
            # Ties between greedy arms are broken uniformly at random.
            machine_chosen = np.random.choice(greedy_machines)

        actual_reward = np.random.normal(rewards[machine_chosen], 1)
        times_sampled[machine_chosen] += 1

        # Incremental form of the sample average: Q += (R - Q) / n.
        reward_estimates[machine_chosen] += (
            (actual_reward - reward_estimates[machine_chosen]) / times_sampled[machine_chosen]
        )
        actual_rewards.append(actual_reward)

        # Baseline: what pulling the truly optimal arm would have paid this step.
        optimal_rewards.append(np.random.normal(best_mean, 1))

    return actual_rewards, optimal_rewards, reward_estimates

In [191]:
# Seed NumPy's global generator so that re-running this cell reproduces the
# same bandit problems and reward draws (and therefore the same curves).
SEED = 0
np.random.seed(SEED)

explore_rates = [0, 0.01, 0.1]
results = {}
for e in explore_rates:
    # Per-step reward totals, accumulated over all simulated problems.
    sum_actual_rewards = np.zeros(N_STEP)
    sum_optimal_rewards = np.zeros(N_STEP)

    for problem in range(N_PROBLEM):
        # Each problem gets K true arm means drawn from a standard normal.
        actual_rewards, optimal_rewards, value_estimates = e_greedy(
            np.random.normal(0, 1, K), epsilon=e, steps=N_STEP)
        sum_actual_rewards += actual_rewards
        sum_optimal_rewards += optimal_rewards

    # Ratio of cumulative obtained reward to cumulative optimal-arm reward at each step.
    results["epsilon = {0}".format(e)] = np.cumsum(sum_actual_rewards)/np.cumsum(sum_optimal_rewards)

In [192]:
# Each series in `results` is cumulative obtained reward divided by cumulative
# optimal-arm reward, so the y-axis is labelled as a fraction of optimal reward
# rather than as a share of optimal actions.
line = Line(results, legend="bottom_right", ylabel='fraction of optimal reward', xlabel="steps",
            plot_width=500, plot_height=300)
show(line)