# Multi-armed bandits
Aplique el algoritmo bandit ε-greedy con
 
 - ε= 0 (greedy)
 - ε= 0.01
 - ε= 0.1 
 
A un problema k-armed bandit con k= 10 acciones.

Considere recompensas con medias aleatorias y desvío estándar constante σ.

Analice experimentalmente el efecto del desvío estándar σ evaluando tres casos:
- σ= 0 (determinístico)
- σ= 1
- σ= 10

¿Qué conclusiones puede sacar?

In [None]:
import itertools

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from armed_bandits import EpsilonGreedyBandit

# Global plotting setup: render all figure text through LaTeX and use
# seaborn's "notebook" context with doubled font sizes.
plt.rcParams.update({"text.usetex": True})
sns.set_context(context="notebook", font_scale=2)

In [None]:
# Result containers filled in by the cells below.
rewards_per_sigma = {}  # sigma -> array holding one reward per arm
rewards_in_time = {}  # (sigma, epsilon) -> rewards in the order they were obtained
bandits = {}  # (sigma, epsilon) -> bandit object after its run

In [None]:
# Experiment grid: every sigma is combined with every epsilon.
epsilon_list = [0, 0.01, 0.1]  # exploration rates (0 is purely greedy)
sigma_list = [0, 1, 10]  # standard deviations used when drawing the rewards

# Size of each experiment.
reps = 20_000  # arm pulls per (sigma, epsilon) run
k = 10  # number of arms

In [None]:
# Build the reward table for every sigma: draw k arm means from N(0, 1), then
# draw one reward per arm from N(mean, sigma).
# NOTE(review): each arm's reward is sampled only once here and the simulation
# cell reuses that same value on every pull, so sigma changes the spread of the
# fixed payoffs rather than adding per-pull noise -- confirm this is intended.
for sigma in sigma_list:
    # Seed derived from sigma so each sigma gets its own reproducible draw.
    np.random.seed(2 * sigma)
    arm_means = np.random.normal(loc=0, scale=1, size=k)
    rewards_per_sigma[sigma] = np.random.normal(loc=arm_means, scale=sigma, size=k)

In [None]:
# Run one epsilon-greedy experiment of `reps` pulls for every (sigma, epsilon)
# combination, recording the reward obtained at each step and keeping the
# final bandit object for later inspection.
for sigma, epsilon in itertools.product(sigma_list, epsilon_list):
    # Reseed NumPy's global generator so every run starts from the same state
    # (presumably EpsilonGreedyBandit draws from it -- TODO confirm).
    np.random.seed(101)
    rewards = rewards_per_sigma[sigma]

    bandit = EpsilonGreedyBandit(k, epsilon)

    # Collect into a fresh list and assign it afterwards: appending to the
    # entry already stored in rewards_in_time would make a re-run of this cell
    # extend the previous history instead of replacing it, leaving more than
    # `reps` values and breaking the running-average plot.
    history = []
    for _ in range(reps):
        arm = bandit.select_arm()
        reward = rewards[arm]
        bandit.update(arm, reward)
        history.append(reward)

    rewards_in_time[(sigma, epsilon)] = history
    bandits[(sigma, epsilon)] = bandit

In [None]:
# For every finished run, save a bar chart of bandit.selected_arm_counts
# (presumably how many times each arm was pulled -- see the y-axis label).
for (sigma, epsilon), bandit in bandits.items():
    plt.figure()
    ax = sns.barplot(x=range(k), y=bandit.selected_arm_counts, hue=range(k))

    # Leave a small margin around the bars; the y-axis tops out just above reps.
    ax.set_xlim(-1, k)
    ax.set_ylim(0, int(reps * 1.01))
    ax.set_xlabel("Brazo")
    ax.set_ylabel("Veces seleccionado")

    # hue is only used to colour the bars, so replace its legend with an empty one.
    ax.legend([], [], frameon=False)

    plt.savefig(f"../img/arm_sigma_{sigma}_epsilon_{epsilon}.png", transparent=True, bbox_inches="tight")
    plt.close()

In [None]:
# For every sigma, draw the running average reward of each epsilon on a shared
# figure and save it to disk.
for sigma in sigma_list:
    plt.figure()
    for epsilon in epsilon_list:
        sns.lineplot(
            x=range(reps),
            # Running mean: cumulative reward divided by the number of pulls so far.
            y=np.cumsum(rewards_in_time[(sigma, epsilon)]) / (np.arange(reps) + 1),
            # Raw f-string: "\e" is not a valid escape sequence in a regular
            # string literal and triggers a SyntaxWarning on recent Pythons.
            label=rf"$\epsilon$ = {epsilon}",
        )

    plt.legend(frameon=False)
    plt.xlabel("Iteración")
    plt.ylabel("Recompensa")
    # Only the first 1000 iterations are shown.
    plt.xlim(0, 1000)
    plt.savefig(f"../img/reward_iteration_sigma_{sigma}.png", transparent=True, bbox_inches="tight")
    plt.close()

In [None]:
# For every sigma, save two figures: the reward assigned to each arm, and the
# distribution of each bandit's estimated values for the three epsilons.
for sigma in sigma_list:
    # Bar chart of the per-arm rewards drawn for this sigma.
    plt.figure()
    sns.barplot(x=range(k), y=rewards_per_sigma[sigma], hue=range(k))
    plt.xlabel("Brazo")
    plt.ylabel("Recompensa")
    # hue is only used to colour the bars, so replace its legend with an empty one.
    plt.legend([], [], frameon=False)

    plt.savefig(f"../img/rewards_sigma_{sigma}.png", transparent=True, bbox_inches="tight")
    plt.close()

    # Violin plot with one violin per epsilon, built from bandit.estimated_values.
    # Raw f-string: "\e" is not a valid escape sequence in a regular string
    # literal and triggers a SyntaxWarning on recent Pythons.
    values_by_epsilon = {
        rf"$\epsilon$ = {epsilon}": bandits[sigma, epsilon].estimated_values for epsilon in epsilon_list
    }
    plt.figure(frameon=False)
    sns.violinplot(values_by_epsilon)
    plt.ylabel("Q-valor")
    plt.savefig(f"../img/values_sigma_{sigma}.png", transparent=True, bbox_inches="tight")
    plt.close()