# Stochastic MABs

É o problema básico de MAB (multi-armed bandit). O algoritmo deve escolher entre K ações ao longo de T rodadas. Cada ação está associada a uma distribuição de recompensa, que não muda ao longo das rodadas. O objetivo é descobrir a ação com a maior recompensa média sem gastar rodadas demais explorando, de modo a maximizar a recompensa média obtida ao longo das T rodadas.

## Carregando bibliotecas

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
from tqdm import tqdm

## Definindo constantes

In [2]:
# Experiment-wide settings shared by every cell below.
SEED = 1234        # seed for NumPy's global random generator
NUM_ARMS = 10      # number of arms (actions) in the bandit
NUM_ROUNDS = 1000  # number of rounds played in each experiment

# Seed the global NumPy generator, which every np.random.* call below uses.
np.random.seed(SEED)

## Criando classes básicas

### Ambiente

In [3]:
class MABEnvironment:
    """Stochastic bandit whose arms pay a reward of 0 or 1.

    Each arm has a fixed success probability, sampled once at construction
    with ``np.random.uniform(low=0, high=1)``.
    """

    def __init__(self, num_arms: int):
        self.num_arms = num_arms
        # One success probability per arm; never modified by this class.
        self.reward_distributions = np.random.uniform(low=0, high=1, size=num_arms)

    def step(self, action: int) -> int:
        """Pull arm ``action`` and return the sampled reward (0 or 1)."""
        success_prob = self.reward_distributions[action]
        return np.random.choice([0, 1], p=[1 - success_prob, success_prob])

    def get_best_arm_reward_prob(self) -> float:
        """Return the highest success probability among all arms."""
        return self.reward_distributions[self.get_best_arm_index()]

    def get_best_arm_index(self) -> int:
        """Return the 0-based index of the arm with the highest probability."""
        return np.argmax(self.reward_distributions)

    def display(self):
        """Show a bar chart of the arm probabilities plus two summary tables.

        The best arm is drawn in green, all others in blue.
        """
        best = self.get_best_arm_index()
        arm_labels = [str(i) for i in range(1, self.num_arms + 1)]
        bar_colors = ['green' if i == best else 'blue' for i in range(self.num_arms)]

        df = pd.DataFrame({
            'Arm': arm_labels,
            'Reward Distribution': self.reward_distributions
        })

        fig = px.bar(df, x='Arm', y='Reward Distribution', title='Reward Distribution of Arms')
        fig['data'][0]['marker']['color'] = bar_colors
        fig.show()

        # Raw table first, then its descriptive statistics.
        display(df)
        display(df.describe())

# Shared environment: every experiment below is run against this same instance.
env = MABEnvironment(num_arms=NUM_ARMS)

In [4]:
# Show the bar chart of arm reward probabilities and the two summary tables.
env.display()

Unnamed: 0,Arm,Reward Distribution
0,1,0.191519
1,2,0.622109
2,3,0.437728
3,4,0.785359
4,5,0.779976
5,6,0.272593
6,7,0.276464
7,8,0.801872
8,9,0.958139
9,10,0.875933


Unnamed: 0,Reward Distribution
count,10.0
mean,0.600169
std,0.282342
min,0.191519
25%,0.31678
50%,0.701042
75%,0.797744
max,0.958139


### Algoritmo (agente)

In [5]:
from abc import ABC, abstractmethod

class MABAlgorithm(ABC):
    """Common interface for bandit agents.

    Subclasses implement ``__init__``, ``update`` and ``select_action``.
    The concrete ``reset``/``step`` drivers remember the last chosen arm in
    ``self.last_action`` so the caller only has to feed back rewards.
    """

    @abstractmethod
    def __init__(self, num_arms: int):
        """Set up the agent for a bandit with ``num_arms`` arms."""

    @abstractmethod
    def update(self, action: int, reward: int):
        """Incorporate the ``reward`` observed after playing ``action``."""

    @abstractmethod
    def select_action(self) -> int:
        """Return the index of the arm to play next."""

    def reset(self) -> int:
        """Choose the first arm, remember it, and return it."""
        first_action = self.select_action()
        self.last_action = first_action
        return first_action

    def step(self, reward: int) -> int:
        """Learn from ``reward`` for the last arm, then choose the next arm."""
        self.update(self.last_action, reward)
        next_action = self.select_action()
        self.last_action = next_action
        return next_action

In [6]:
class Random(MABAlgorithm):
    """Baseline agent that ignores all feedback and picks arms at random."""

    def __init__(self, num_arms: int):
        self.num_arms = num_arms

    def update(self, action: int, reward: int):
        """Do nothing: this agent keeps no statistics."""
        return None

    def select_action(self) -> int:
        """Return an arm index drawn by ``np.random.choice`` over all arms."""
        candidate_arms = range(self.num_arms)
        return np.random.choice(candidate_arms)

### Experimento

In [7]:
class MABExperiment:
    """Plays one agent against one environment and reports the outcome."""

    def __init__(self, num_arms: int, num_rounds: int, algorithm: MABAlgorithm, environment: MABEnvironment):
        self.num_arms = num_arms
        self.num_rounds = num_rounds
        self.alg = algorithm
        self.env = environment

    def run(self, plot_graphics: bool=True):
        """Play ``num_rounds`` rounds and collect statistics.

        Returns:
            With ``plot_graphics=True`` (default), the figure produced by the
            plotting helper. Otherwise the tuple
            ``(mean_rewards, actions_select_count)``: the running average
            reward after each round and how often each arm was played.
        """
        selection_counts = [0] * self.num_arms
        running_means = []
        total_reward = 0
        action = self.alg.reset()

        for round_number in tqdm(range(1, self.num_rounds + 1)):
            selection_counts[action] += 1
            reward = self.env.step(action)
            total_reward += reward
            running_means.append(total_reward / round_number)
            # Feed the reward back and obtain the arm for the next round.
            action = self.alg.step(reward)

        if not plot_graphics:
            return running_means, selection_counts

        return self.__plot_graphics(running_means, selection_counts)

    def __plot_graphics(self, mean_rewards: 'list[float]', actions_select_count: 'list[int]'):
        """Show the average-reward chart and return the selection-count chart.

        The line chart compares the agent's running average reward with the
        best arm's reward probability. The returned bar chart highlights the
        best arm in green.
        """
        total_rounds = self.num_rounds
        round_numbers = list(range(1, total_rounds + 1))
        alg_name = self.alg.__class__.__name__
        best_arm_index = self.env.get_best_arm_index()
        best_reward_prob = self.env.get_best_arm_reward_prob()

        df_alg = pd.DataFrame({
            'Round': round_numbers,
            'Reward': mean_rewards,
            'Type': [alg_name] * total_rounds
        })
        # Constant reference line at the best arm's reward probability.
        df_best = pd.DataFrame({
            'Round': round_numbers,
            'Reward': [best_reward_prob] * total_rounds,
            'Type': ['Best'] * total_rounds
        })
        avg_rewards_per_round_df = pd.concat([df_alg, df_best], ignore_index=True)

        line_fig = px.line(avg_rewards_per_round_df, x="Round", y="Reward", color='Type', title="Average Reward per Round")
        line_fig.show()

        arm_labels = [str(i) for i in range(1, self.num_arms + 1)]
        bar_colors = ['green' if i == best_arm_index else 'blue' for i in range(self.num_arms)]
        df_actions = pd.DataFrame({
            'Arm': arm_labels,
            'Number of Selections': actions_select_count
        })

        bar_fig = px.bar(df_actions, x='Arm', y='Number of Selections', title='Number of Selections of Each Arm')
        bar_fig['data'][0]['marker']['color'] = bar_colors
        return bar_fig


In [8]:
# Baseline: the Random agent on the shared environment.
# run() shows the average-reward chart and returns the selection-count figure.
random_results = MABExperiment(
    num_arms=NUM_ARMS, 
    num_rounds=NUM_ROUNDS,
    algorithm=Random(NUM_ARMS),
    environment=env
).run()

 68%|██████▊   | 680/1000 [00:00<00:00, 6776.63it/s]

100%|██████████| 1000/1000 [00:00<00:00, 4994.72it/s]


In [9]:
# Render the selection-count figure returned by the run above.
random_results

## Criando e testando os algoritmos

### Explore-first

Algoritmo bem simples: no início, explora cada ação N vezes. Depois disso, na fase de aprofundamento (exploitation), escolhe sempre a ação que teve os melhores resultados (ou seja, a maior média de recompensas entre todas as ações).

In [10]:
class ExploreFirst(MABAlgorithm):
    """Explore-first agent.

    Plays each arm ``num_explore_steps_per_arm`` times in index order, then
    commits to the arm with the highest observed mean reward for all
    remaining rounds.

    Args:
        num_arms: Number of arms in the bandit.
        num_explore_steps_per_arm: How many times each arm is played during
            the exploration phase. Must be at least 1.

    Raises:
        ValueError: If ``num_explore_steps_per_arm`` is smaller than 1. The
            per-arm step counter could then never reach its target, so the
            exploration phase would never end.
    """

    def __init__(self, num_arms: int, num_explore_steps_per_arm: int):
        if num_explore_steps_per_arm < 1:
            raise ValueError(
                "num_explore_steps_per_arm must be >= 1, "
                f"got {num_explore_steps_per_arm}"
            )
        self.num_arms = num_arms
        self.num_explore_steps_per_arm = num_explore_steps_per_arm
        self.current_explore_steps_count = 0
        self.current_arm = 0
        # Holds the running reward sum of an arm while it is being explored,
        # and its mean reward once that arm's exploration is complete.
        self.mean_rewards = [0 for _ in range(num_arms)]
        self.best_arm = None

    def update(self, action: int, reward: int):
        """Record ``reward`` for ``action`` while still exploring."""
        # Exploitation phase: the best arm is already fixed, nothing to update.
        if self.current_arm >= self.num_arms:
            return

        # Exploration phase: accumulate the reward of the arm being explored.
        self.current_explore_steps_count += 1
        self.mean_rewards[action] += reward
        if self.current_explore_steps_count != self.num_explore_steps_per_arm:
            return

        # This arm is done: turn its sum into a mean and move to the next arm.
        self.mean_rewards[action] /= self.num_explore_steps_per_arm
        self.current_arm += 1
        self.current_explore_steps_count = 0
        if self.current_arm >= self.num_arms:
            # All arms explored: commit to the best one (first index on ties).
            self.best_arm = int(np.argmax(self.mean_rewards))

    def select_action(self) -> int:
        """Return the committed best arm, or the arm currently being explored."""
        if self.best_arm is not None:
            return self.best_arm
        return self.current_arm

In [11]:
# Explore-first with 25 exploration pulls per arm
# (10 arms * 25 pulls = 250 of the 1000 rounds spent exploring).
explore_first_results = MABExperiment(
    num_arms=NUM_ARMS, 
    num_rounds=NUM_ROUNDS,
    algorithm=ExploreFirst(NUM_ARMS, 25),
    environment=env
).run()

100%|██████████| 1000/1000 [00:00<00:00, 12373.02it/s]




In [12]:
# Explore-first again, now with only 10 exploration pulls per arm.
# NOTE(review): this rebinds explore_first_results, so the figure returned by
# the 25-pull run is no longer reachable through that name.
explore_first_results = MABExperiment(
    num_arms=NUM_ARMS, 
    num_rounds=NUM_ROUNDS,
    algorithm=ExploreFirst(NUM_ARMS, 10),
    environment=env
).run()

100%|██████████| 1000/1000 [00:00<00:00, 12039.45it/s]


### Epsilon-Greedy

In [13]:
class EpsilonGreedy(MABAlgorithm):
    """Epsilon-greedy agent with a constant exploration rate.

    When a uniform draw falls below ``epsilon`` it plays an arm chosen by
    ``np.random.choice``; otherwise it plays the arm with the highest
    observed mean reward.
    """

    def __init__(self, num_arms: int, epsilon: float):
        self.num_arms = num_arms
        self.epsilon = epsilon
        # Per-arm statistics, all starting at zero.
        self.rewards_acum = [0] * num_arms
        self.actions_count = [0] * num_arms
        self.rewards_mean = [0] * num_arms

    def update(self, action: int, reward: int):
        """Add ``reward`` to the statistics of ``action`` and refresh its mean."""
        total = self.rewards_acum[action] + reward
        pulls = self.actions_count[action] + 1
        self.rewards_acum[action] = total
        self.actions_count[action] = pulls
        self.rewards_mean[action] = total / pulls

    def select_action(self) -> int:
        """Return the next arm: a random one when exploring, else the greedy one."""
        explore = np.random.uniform() < self.epsilon
        if not explore:
            return np.argmax(self.rewards_mean)
        return np.random.choice(range(self.num_arms))

In [14]:
# Epsilon-greedy with a fixed exploration rate of 0.1.
epsilon_greedy_results = MABExperiment(
    num_arms=NUM_ARMS, 
    num_rounds=NUM_ROUNDS,
    algorithm=EpsilonGreedy(NUM_ARMS, 0.1),
    environment=env
).run()

100%|██████████| 1000/1000 [00:00<00:00, 6176.94it/s]


In [15]:
class DecreasingEpsilonGreedy(MABAlgorithm):
    """Epsilon-greedy agent whose exploration rate shrinks over time.

    After every update ``epsilon`` is multiplied by ``alpha``. Arm selection
    is otherwise the usual epsilon-greedy rule: a random arm when a uniform
    draw falls below ``epsilon``, else the arm with the highest observed
    mean reward.
    """

    def __init__(self, num_arms: int, epsilon: float, alpha: float):
        self.num_arms = num_arms
        self.epsilon = epsilon
        self.alpha = alpha
        # Per-arm statistics, all starting at zero.
        self.rewards_acum = [0] * num_arms
        self.actions_count = [0] * num_arms
        self.rewards_mean = [0] * num_arms

    def update(self, action: int, reward: int):
        """Record ``reward`` for ``action``, then decay the exploration rate."""
        total = self.rewards_acum[action] + reward
        pulls = self.actions_count[action] + 1
        self.rewards_acum[action] = total
        self.actions_count[action] = pulls
        self.rewards_mean[action] = total / pulls
        # Multiplicative decay applied once per observed reward.
        self.epsilon = self.epsilon * self.alpha

    def select_action(self) -> int:
        """Return the next arm: a random one when exploring, else the greedy one."""
        explore = np.random.uniform() < self.epsilon
        if not explore:
            return np.argmax(self.rewards_mean)
        return np.random.choice(range(self.num_arms))

In [16]:
# Epsilon-greedy starting at epsilon=0.5, multiplied by 0.99 after every update.
decreasing_epsilon_greedy_results = MABExperiment(
    num_arms=NUM_ARMS, 
    num_rounds=NUM_ROUNDS,
    algorithm=DecreasingEpsilonGreedy(NUM_ARMS, 0.5, 0.99),
    environment=env
).run()

100%|██████████| 1000/1000 [00:00<00:00, 11199.86it/s]
