# 10-armed Testbed

In [1]:
import numpy as np
from fontTools.misc.bezierTools import epsilon
from numpy.core.fromnumeric import shape
from numpy.lib.function_base import average
from tqdm import trange
import matplotlib
import matplotlib.pyplot as plt
from bandit import Bandit

matplotlib.use('Agg')

In [2]:
def simulate(runs, times, bandits):
    # region Summary
    """
    Run every bandit through a number of independent runs and average the outcome per time step.

    At the start of each run the bandit is re-initialized; it then selects an action and receives a
    reward for `times` consecutive steps. For every step the reward is stored, together with a flag
    telling whether the selected action equals the bandit's `optimal_action`.
    :param runs: Number of independent runs performed for each bandit
    :param times: Number of time steps in a single run
    :param bandits: Bandit problems; each must provide initialize(), act(), step(action) and optimal_action
    :return: Two arrays of shape (len(bandits), times), both averaged over runs: the fraction of runs in
             which the optimal action was selected at each step, and the mean reward at each step
    """
    # endregion Summary

    # region Body

    # Both result arrays are indexed as [bandit, run, time step] and start out filled with 0s
    result_shape = (len(bandits), runs, times)
    rewards = np.zeros(result_shape)
    optimal_action_counts = np.zeros(result_shape)

    for bandit_index, bandit in enumerate(bandits):
        # trange shows one progress bar per bandit
        for run_index in trange(runs):
            # Every run starts from a freshly initialized bandit
            bandit.initialize()

            # Row views: writing into them updates the full result arrays
            run_rewards = rewards[bandit_index, run_index]
            run_optimal_flags = optimal_action_counts[bandit_index, run_index]

            for step_index in range(times):
                chosen_action = bandit.act()
                run_rewards[step_index] = bandit.step(chosen_action)

                # Flip the 0 to 1 only when the optimal action was selected at this step
                if chosen_action == bandit.optimal_action:
                    run_optimal_flags[step_index] = 1

    # Collapse the run axis so each bandit is left with one value per time step
    return optimal_action_counts.mean(axis=1), rewards.mean(axis=1)

    # endregion Body

## 1. Reward Distribution

In [3]:
# Example reward distribution: 200 standard-normal samples for each of 10 actions,
# with every action's column shifted by its own standard-normal offset
unit_variance_samples = np.random.randn(200, 10)
action_offsets = np.random.randn(10)
plt.violinplot(dataset=unit_variance_samples + action_offsets)

# Annotate the axes
plt.xlabel("Action")
plt.ylabel("Reward distribution")
plt.title("Figure 2.1")

# Persist the figure and release it
plt.savefig("../generated_images/figure_2_1.png")
plt.close()

## 2. Greedy Action Selection VS ε-greedy Action Selection

In [4]:
# Exploration rates to compare: greedy (0) and two ε-greedy settings (0.1 and 0.01)
epsilons = [0, 0.1, 0.01]

# One bandit per exploration rate, in the same order as `epsilons`; all use the sample-average method
bandits = [Bandit(epsilon=rate, use_sample_averages=True) for rate in epsilons]

In [5]:
# Experiment size: number of independent runs and number of time steps per run
runs, times = 2000, 1000

# Simulate and keep both results: optimal action counts and rewards (each averaged over runs)
optimal_action_counts, rewards = simulate(runs, times, bandits)

100%|██████████| 2000/2000 [03:51<00:00,  8.63it/s]
100%|██████████| 2000/2000 [04:03<00:00,  8.21it/s]
100%|██████████| 2000/2000 [02:48<00:00, 11.87it/s]


In [6]:
# Plotting: open a new 10x20 inch figure; the two stacked subplots (average reward and
# % optimal action) are added to it by the following cells
plt.figure(figsize=(10, 20))

<Figure size 1000x2000 with 0 Axes>

In [7]:
# First of the two stacked panels: average reward per step, one curve per epsilon
plt.subplot(2, 1, 1)

# The loop variables deliberately do NOT reuse the names `rewards` / `epsilon`: rebinding
# `rewards` here would replace the simulation result with its last row, so re-running this
# cell (or any later cell reading `rewards`) would silently use the wrong data.
for eps, reward_curve in zip(epsilons, rewards):
    # Raw string: mathtext needs the backslash, and "\e" is not a valid Python escape sequence
    plt.plot(reward_curve, label=r"$\epsilon = %.02f$" % eps)

plt.title("Figure 2.2")
plt.xlabel("Steps")
plt.ylabel("Average reward")
plt.legend()

<matplotlib.legend.Legend at 0x272ff89be90>

In [8]:
# Second of the two stacked panels: share of runs that picked the optimal action, one curve per epsilon
plt.subplot(2, 1, 2)

# `eps` is used instead of `epsilon` so the loop does not rebind the module-level name `epsilon`
for eps, counts in zip(epsilons, optimal_action_counts):
    # Raw string: mathtext needs the backslash, and "\e" is not a valid Python escape sequence
    plt.plot(counts, label=r"$\epsilon = %.02f$" % eps)

plt.xlabel("Steps")
plt.ylabel("% Optimal action")
plt.legend()

<matplotlib.legend.Legend at 0x272fff69410>

In [9]:
# Save the current figure (both subplots drawn in the preceding cells) to disk,
# then close it so the next section starts plotting on a fresh figure
plt.savefig("../generated_images/figure_2_2.png")
plt.close()

## 3. Optimistic Initial Values VS Realistic Initial Values

In [10]:
# Two bandits sharing the constant step size 𝛼 = 0.1 and differing in exploration and initial estimates
bandits = [
    # 1st bandit, optimistic and greedy: ε = 0, 𝑄_1(𝑎) = 5, 𝛼 = 0.1
    Bandit(epsilon=0, initial_action_value_estimates=5, step_size=0.1),
    # 2nd bandit, realistic and ε-greedy: ε = 0.1, 𝑄_1(𝑎) = 0, 𝛼 = 0.1
    Bandit(epsilon=0.1, initial_action_value_estimates=0, step_size=0.1),
]

In [11]:
# Experiment size: number of independent runs and number of time steps per run
runs, times = 2000, 1000

# Only the optimal action counts are needed for this figure; the rewards are discarded
optimal_action_counts, _ = simulate(runs, times, bandits)

100%|██████████| 2000/2000 [04:46<00:00,  6.97it/s]
100%|██████████| 2000/2000 [01:39<00:00, 20.03it/s]


In [12]:
# Plotting: one curve per bandit, in the order the bandits were created
# (index 0: ε = 0 with Q1 = 5, index 1: ε = 0.1 with Q1 = 0).
# Raw strings: mathtext needs the backslash, and "\e" is not a valid Python escape sequence.
plt.plot(optimal_action_counts[0], label=r"$\epsilon = 0,Q1 = 5$")
plt.plot(optimal_action_counts[1], label=r"$\epsilon = 0.1,Q1 = 0$")
plt.title("Figure 2.3")
plt.xlabel("Steps")
plt.ylabel("% Optimal action")
plt.legend()

# Save the figure to disk and close it
plt.savefig("../generated_images/figure_2_3.png")
plt.close()

## 4. Upper-Confidence-Bound (UCB) Action Selection

In [3]:
# Create a list of 2 bandits where:
# 1. 1st bandit: ε = 0, 𝑐 = 2, uses sample-average method (UCB),
# 2. 2nd bandit: ε = 0.1, uses sample-average method (ε-greedy baseline)
# The ε-greedy baseline must NOT receive `confidence_level`: it is the method UCB is compared
# against, and it is built the same way as the ε-greedy bandits elsewhere in this notebook.
bandits = [
    Bandit(epsilon=0, confidence_level=2, use_sample_averages=True),
    Bandit(epsilon=0.1, use_sample_averages=True),
]

In [8]:
# Experiment size: number of independent runs and number of time steps per run
runs, times = 2000, 1000

# Only the average rewards are needed for this figure; the optimal action counts are discarded
_, average_rewards = simulate(runs, times, bandits)

100%|██████████| 2000/2000 [04:09<00:00,  8.03it/s]
100%|██████████| 2000/2000 [04:11<00:00,  7.95it/s]


In [9]:
# Plotting: one curve per bandit, in the order the bandits were created
plt.plot(average_rewards[0], label="UCB $c = 2$")
# The "$" signs must be balanced for the label to be rendered as mathtext; with an odd number
# of them the whole label is drawn literally. A raw string is used because "\e" is not a valid
# Python escape sequence.
plt.plot(average_rewards[1], label=r"$\epsilon$-greedy $\epsilon = 0.1$")
plt.title("Figure 2.4")
plt.xlabel("Steps")
plt.ylabel("Average reward")
plt.legend()

# Save the figure to disk and close it
plt.savefig("../generated_images/figure_2_4.png")
plt.close()

## 5. Gradient Bandit Algorithms (GBA)

In [4]:
# Four gradient bandit algorithm (GBA) bandits, all with a true expected reward of 4, covering every
# combination of step size and baseline usage. Resulting order:
# 1. 𝛼 = 0.1, with average-reward baseline,
# 2. 𝛼 = 0.1, without baseline,
# 3. 𝛼 = 0.4, with average-reward baseline,
# 4. 𝛼 = 0.4, without baseline
bandits = [
    Bandit(use_gradient=True, step_size=alpha, use_gradient_baseline=with_baseline, true_expected_reward=4)
    for alpha in (0.1, 0.4)
    for with_baseline in (True, False)
]

In [5]:
# Experiment size: number of independent runs and number of time steps per run
runs, times = 2000, 1000

# Only the optimal action counts are needed for this figure; the rewards are discarded
optimal_action_counts, _ = simulate(runs, times, bandits)

100%|██████████| 2000/2000 [05:06<00:00,  6.53it/s]
100%|██████████| 2000/2000 [06:03<00:00,  5.50it/s]
100%|██████████| 2000/2000 [02:50<00:00, 11.74it/s]
100%|██████████| 2000/2000 [02:57<00:00, 11.24it/s]


In [13]:
# Legend labels, generated in the same order as the gradient bandits:
# for each step size, first "with baseline", then "without baseline"
labels = [
    rf"$\alpha = {step_size}$, {variant} baseline"
    for step_size in ("0.1", "0.4")
    for variant in ("with", "without")
]

In [14]:
# Plotting: draw one optimal-action curve per bandit, paired with the label at the same index
for bandit_index, _bandit in enumerate(bandits):
    plt.plot(optimal_action_counts[bandit_index], label=labels[bandit_index])

In [15]:
# Annotate the curves drawn in the previous cell
plt.xlabel("Steps")
plt.ylabel("% Optimal action")
plt.title("Figure 2.5")
plt.legend()

# Save the figure to disk and close it
plt.savefig("../generated_images/figure_2_5.png")
plt.close()


## 6. All Methods Comparison


In [8]:
# One bandit per action-selection method; `labels` below is index-aligned with this list
bandits_all = [
    Bandit(epsilon=0, use_sample_averages=True),  # Greedy
    Bandit(epsilon=0.1, use_sample_averages=True),  # ε-greedy
    Bandit(epsilon=0, initial_action_value_estimates=5, step_size=0.1),  # Optimistic Initial Values
    Bandit(epsilon=0, confidence_level=2, use_sample_averages=True),  # UCB
    Bandit(use_gradient=True, step_size=0.1, use_gradient_baseline=True)  # Gradient Bandit
]

# Experiment size: number of independent runs and number of time steps per run
runs, times = 2000, 1000

# Only the average rewards are compared here; the optimal action counts are discarded
_, average_rewards_all = simulate(runs, times, bandits_all)

# Wide figure for the comparison
plt.figure(figsize=(12, 6))

# Legend entries, in the same order as `bandits_all`
labels = [
    'Greedy (ε=0)',
    'ε-greedy (ε=0.1)',
    'Optimistic (Q1=5, α=0.1)',
    'UCB (c=2)',
    'Gradient Bandit (α=0.1)'
]

# One average-reward curve per method
for reward, label in zip(average_rewards_all, labels):
    plt.plot(reward, label=label)

# Titles, axis labels, legend and a light dashed grid
plt.title("Comparison of All Methods")
plt.xlabel("Steps", fontsize=12)
plt.ylabel("Average Reward", fontsize=12)
plt.legend(loc='lower right', fontsize=10)
plt.grid(True, linestyle='--', alpha=0.6)

# Tighten the layout, then save the figure to disk and close it
plt.tight_layout()
plt.savefig("../generated_images/figure_2_6.png")
plt.close()

100%|██████████| 2000/2000 [02:19<00:00, 14.30it/s]
100%|██████████| 2000/2000 [02:31<00:00, 13.24it/s]
100%|██████████| 2000/2000 [02:13<00:00, 14.95it/s]
100%|██████████| 2000/2000 [02:52<00:00, 11.62it/s]
100%|██████████| 2000/2000 [03:41<00:00,  9.03it/s]
