### Simulation of the multi-armed bandit problem as described in: [Reinforcement Learning book by Richard Sutton](http://incompleteideas.net/sutton/book/the-book-2nd.html)

In [195]:
import multiprocessing
from pprint import pprint as pp

import numpy as np
from bokeh.charts import Line, output_file, show
from bokeh.charts.attributes import ColorAttr, CatAttr
from bokeh.io import output_notebook
from bokeh.plotting import figure
from bokeh.sampledata.autompg import autompg as df
from matplotlib import pyplot as plt


%matplotlib inline
output_notebook()

In [200]:
# Default number of independent bandit problems summed per simulation run.
N_PROBLEM = 100
# Default number of pulls (time steps) simulated for each bandit problem.
N_STEP = 2000
# Default number of machines (arms) in each bandit problem.
K = 10

In [23]:
def e_greedy(rewards, epsilon=0, steps=N_STEP, initial_value=0, alpha=None):
    """Run one epsilon-greedy agent on a single multi-armed bandit problem.

    At every step the agent either pulls one of the machines whose current
    reward estimate is highest (ties broken uniformly at random) or, with
    probability ``epsilon``, pulls a uniformly chosen machine from the
    remaining (non-greedy) ones. Observed rewards are drawn from a normal
    distribution with unit standard deviation centred on the machine's
    true mean.

    Args:
        rewards: Sequence of true mean rewards, one entry per machine. Its
            length determines the number of machines.
        epsilon: Probability of exploring at each step; must lie in [0, 1].
        steps: Number of pulls to simulate.
        initial_value: Starting reward estimate assigned to every machine.
        alpha: Constant step size for the estimate update. ``None`` selects
            the sample-average update (step size ``1 / n`` for the n-th pull
            of a machine).

    Returns:
        A tuple ``(actual_rewards, optimal_rewards, reward_estimates)``:
        the reward observed at each step, an independent reward sample from
        the best machine at each step, and the final per-machine estimates.
    """
    # Size the bookkeeping from the problem itself rather than the global K,
    # so problems with any number of machines work.
    n_machines = len(rewards)
    best_mean = max(rewards)  # loop-invariant: true mean of the best machine

    actual_rewards = []
    optimal_rewards = []
    times_sampled = [0] * n_machines
    reward_estimates = [initial_value] * n_machines

    for _ in range(steps):
        max_estimate = max(reward_estimates)
        greedy_machines = [i for i, r in enumerate(reward_estimates) if r == max_estimate]
        explore_machines = [i for i, r in enumerate(reward_estimates) if r != max_estimate]

        # Explore with probability epsilon, but only if there is at least one
        # non-greedy machine to pick (a single candidate is enough).
        wants_to_explore = np.random.choice([True, False], p=[epsilon, 1 - epsilon])
        if wants_to_explore and explore_machines:
            machine_chosen = np.random.choice(explore_machines)
        else:
            machine_chosen = np.random.choice(greedy_machines)

        actual_reward = np.random.normal(rewards[machine_chosen], 1)
        times_sampled[machine_chosen] += 1

        # Incremental estimate update: Q <- Q + step_size * (R - Q).
        # With step_size = 1/n this is exactly the running sample average.
        if alpha is None:
            step_size = 1.0 / times_sampled[machine_chosen]
        else:
            step_size = alpha
        reward_estimates[machine_chosen] += step_size * (
            actual_reward - reward_estimates[machine_chosen])

        actual_rewards.append(actual_reward)
        optimal_rewards.append(np.random.normal(best_mean, 1))
    return actual_rewards, optimal_rewards, reward_estimates


def run_simulation(epsilon=0, n_step=N_STEP, n_problem=N_PROBLEM, k=K, init_value=0, alpha=None):
    """Simulate epsilon-greedy agents over many random bandit problems.

    For each problem, ``k`` true mean rewards are drawn from a standard
    normal distribution and one ``e_greedy`` run of ``n_step`` pulls is
    performed. Per-step rewards are summed across problems.

    Args:
        epsilon: Exploration probability passed to ``e_greedy``.
        n_step: Number of pulls per problem.
        n_problem: Number of independent problems to simulate.
        k: Number of machines whose true means are drawn for each problem.
        init_value: Initial reward estimate passed to ``e_greedy``.
        alpha: Constant step size passed to ``e_greedy``; ``None`` keeps the
            sample-average update.

    Returns:
        A NumPy array of length ``n_step`` whose entry ``t`` is the
        cumulative actual reward up to step ``t`` divided by the cumulative
        reward sampled from the best machine up to step ``t`` (both summed
        over all problems).
    """
    sum_actual_rewards = np.zeros(n_step)
    sum_optimal_rewards = np.zeros(n_step)

    for _ in range(n_problem):
        true_means = np.random.normal(0, 1, k)
        # Forward alpha so a requested constant step size is actually used.
        actual_rewards, optimal_rewards, _estimates = e_greedy(
            true_means,
            epsilon=epsilon,
            initial_value=init_value,
            steps=n_step,
            alpha=alpha)
        sum_actual_rewards += actual_rewards
        sum_optimal_rewards += optimal_rewards

    result = np.cumsum(sum_actual_rewards) / np.cumsum(sum_optimal_rewards)
    return result

In [20]:
# Compare exploration rates without passing a step size to run_simulation.
explore_rates = [0, 0.01, 0.1]
results = {}
for e in explore_rates:
    results["epsilon = {0}".format(e)] = run_simulation(epsilon=e, init_value=0)

# run_simulation returns cumulative actual reward divided by cumulative
# optimal reward, so the y axis is labelled with that ratio.
p = Line(results, legend="bottom_right", ylabel='cumulative reward / optimal reward', xlabel="steps",
         plot_width=500, plot_height=300, title="simple sample average")
show(p)

In [24]:
# Compare exploration rates when requesting a constant step size alpha=0.1.
explore_rates = [0, 0.01, 0.1]
results = {}
for e in explore_rates:
    results["epsilon = {0}".format(e)] = run_simulation(epsilon=e, init_value=0, alpha=0.1)

# run_simulation returns cumulative actual reward divided by cumulative
# optimal reward, so the y axis is labelled with that ratio.
p = Line(results, legend="bottom_right", ylabel='cumulative reward / optimal reward', xlabel="steps",
         plot_width=500, plot_height=300, title="recently weighted sample average with constant alpha")
show(p)

### Optimistic initial value of +5 instead of 0 encourages exploration

In [31]:
# Pure-greedy agent with optimistic initial estimates (+5) versus an
# epsilon=0.1 agent starting from estimates of 0, over 500 steps each.
results = {}
results["optimistic, explore = 0"] = run_simulation(epsilon=0, init_value=5, n_step=500)
results["realistic, explore = 0.1"] = run_simulation(epsilon=0.1, init_value=0, n_step=500)

# run_simulation returns cumulative actual reward divided by cumulative
# optimal reward, so the y axis is labelled with that ratio.
p = Line(results, legend="bottom_right", ylabel='cumulative reward / optimal reward', xlabel="steps",
         plot_width=500, plot_height=300)
show(p)