## Imports

In [None]:
import numpy as np
import gymnasium as gym
import matplotlib.pyplot as plt
%matplotlib inline

## K-armed Bandit Problem

A simple version of the k-armed bandit problem is useful because of its nonassociative nature. This makes it a good problem (or environment) for learning basic reinforcement learning methods.

So let's create a simple gymnasium environment to re-create the k-armed bandit problem.

In [None]:
class KArmedBandit(gym.Env):
    """Stationary k-armed bandit with Gaussian rewards.

    Every call to `reset` draws one mean reward per arm from a standard
    normal distribution. Pulling an arm in `step` yields that arm's mean
    plus fresh unit-variance Gaussian noise.

    Note that `step` returns a `(reward, terminated)` pair rather than the
    five-element tuple of the Gymnasium API; the callers in this notebook
    unpack exactly two values.
    """

    def __init__(self, nb_arms=10, nb_steps=1000):
        """Configure the bandit.

        Args:
            nb_arms: Number of arms, i.e. the size of the discrete action space.
            nb_steps: Number of steps after which `step` reports termination.
        """
        self._nb_arms = nb_arms
        self._nb_steps = nb_steps

        self.action_space = gym.spaces.Discrete(nb_arms)
        # Single constant state: the observation space has exactly one value.
        self.observation_space = gym.spaces.Discrete(1)

    def step(self, action):
        """Pull one arm and return its noisy reward.

        `reset` must have been called first, since it creates the arm means
        and the step counter used here.

        Args:
            action: Index of the arm to pull.

        Returns:
            A `(reward, terminated)` tuple, where `terminated` becomes True
            once `nb_steps` steps have been taken since the last reset.
        """
        self._step += 1

        reward = self._arms[action]
        reward_noise = self.np_random.normal(0, 1, size=1)[0]
        terminated = self._step >= self._nb_steps

        return reward + reward_noise, terminated

    def reset(self, seed=None, options=None):
        """Start a new bandit problem by resampling every arm's mean reward.

        Args:
            seed: Optional seed forwarded to `gym.Env.reset` to (re)seed
                `self.np_random`.
            options: Unused; accepted for compatibility with `gym.Env.reset`.

        Returns:
            The `(observation, info)` pair expected from `gym.Env.reset`.
            The observation is always 0 (the only state) and info is empty.
        """
        super().reset(seed=seed)

        self._step = 0
        self._arms = self.np_random.normal(0, 1, size=self._nb_arms)

        return 0, {}


In [None]:
class EpsilonGreedy():
    """Epsilon-greedy agent with sample-average action-value estimates.

    With probability `epsilon` the agent picks a uniformly random action;
    otherwise it picks the action with the highest estimated value, where
    the estimate of an action is the mean of the rewards observed for it.
    Actions that were never taken have an estimate of 0.
    """

    def __init__(self, nb_actions, epsilon):
        """Create the agent.

        Args:
            nb_actions: Number of available actions.
            epsilon: Probability of taking a random (exploratory) action.
        """
        self.nb_actions = nb_actions
        self.epsilon = epsilon

        # Allocates `sum_of_rewards` and `nb_action_taken`.
        self.reset()

    def _estimates(self):
        """Return the per-action mean reward, with 0 for untried actions."""
        estimates = np.zeros(self.nb_actions)
        # Only divide where the action has been taken at least once; other
        # entries keep the 0 written by np.zeros, which avoids 0/0.
        np.divide(
            self.sum_of_rewards,
            self.nb_action_taken,
            out=estimates,
            where=self.nb_action_taken > 0,
        )
        return estimates

    def action(self):
        """Choose the next action.

        Returns:
            The index of the selected action: a random one with probability
            `epsilon`, otherwise the first action with the highest estimate.
        """
        if np.random.uniform(0, 1) < self.epsilon:
            return np.random.randint(0, self.nb_actions)

        return np.argmax(self._estimates())

    def observe(self, action, reward):
        """Record the reward obtained after taking `action`.

        Args:
            action: Index of the action that was taken.
            reward: Reward received for that action.
        """
        self.sum_of_rewards[action] += reward
        self.nb_action_taken[action] += 1

    def reset(self):
        """Forget everything learned so far."""
        self.sum_of_rewards = np.zeros(self.nb_actions)
        # Counts start at 0 so that sum / count is the true sample average.
        self.nb_action_taken = np.zeros(self.nb_actions)

In [None]:
# Build a 10-armed bandit; reset() samples the arm means and zeroes the step counter.
env = KArmedBandit(nb_arms=10)
env.reset()

In [None]:
# Visualise the reward distribution of every arm: pull each arm 200 times
# (round-robin over the arms) and draw one violin per arm.
fig, ax = plt.subplots()

index = list(range(len(env._arms)))
data = np.array([
    [env.step(arm)[0] for arm in index]  # keep the reward, drop `terminated`
    for _ in range(200)
])
print(data.shape)
vp = ax.violinplot(data, positions=index, showmeans=True)

plt.show()

In [None]:
# Fresh 10-armed bandit and an epsilon-greedy agent with epsilon = 0.01.
env = KArmedBandit(nb_arms=10)
agent = EpsilonGreedy(nb_actions=10, epsilon=0.01)

In [None]:
def run_env(env, agent):
    """Play one full episode and return the reward collected at each step.

    Both `env` and `agent` are reset before the episode starts. At every
    step the agent picks an action, the environment is stepped with it, and
    the agent is told which reward that action produced.

    Args:
        env: Environment exposing `reset()` and `step(action)`, where `step`
            returns a `(reward, terminated)` pair.
        agent: Agent exposing `reset()`, `action()` and
            `observe(action, reward)`.

    Returns:
        A NumPy array with one reward per step, in the order they occurred.
    """
    env.reset()
    agent.reset()

    rewards = []

    # At least one step is always taken; stop once the env reports the end.
    while True:
        chosen = agent.action()
        reward, done = env.step(chosen)
        agent.observe(chosen, reward)
        rewards.append(reward)

        if done:
            break

    return np.array(rewards)

In [None]:
# Run a single episode and keep the reward obtained at every step.
list_of_reward = run_env(env, agent)

In [None]:
# Plot the per-step rewards stored in `list_of_reward`.
plt.plot(list_of_reward)
plt.show()

Repeating this for 2000 independent runs,
each with a different bandit problem, we obtained measures of the learning algorithm’s
average behavior.

In [None]:
def run_exp(nb_exps, env, agent):
    """Average the per-step rewards of `nb_exps` independent episodes.

    Each episode is played with `run_env`, which resets both the
    environment and the agent first.

    Args:
        nb_exps: Number of episodes to run; must be at least 1.
        env: Environment passed through to `run_env`.
        agent: Agent passed through to `run_env`.

    Returns:
        A NumPy array holding, for every step, the mean reward over all
        episodes.

    Raises:
        ValueError: If `nb_exps` is smaller than 1.
    """
    # Without this guard one episode would still run and then be divided
    # by zero or by a negative count.
    if nb_exps < 1:
        raise ValueError(f"nb_exps must be at least 1, got {nb_exps}")

    total_rewards = run_env(env, agent)

    for _ in range(nb_exps - 1):
        total_rewards += run_env(env, agent)

    return total_rewards / nb_exps

In [None]:
# Environment shared by the experiments below.
env = KArmedBandit(nb_arms=10)
env.reset()

In [None]:
# Mean reward per step over 2000 runs with epsilon = 0.01.
agent_01 = EpsilonGreedy(nb_actions=10, epsilon=0.01)
mean_rewards_01 = run_exp(2000, env, agent_01)

In [None]:
# Mean reward per step over 2000 runs with epsilon = 0.1.
agent_1 = EpsilonGreedy(nb_actions=10, epsilon=0.1)
mean_rewards_1 = run_exp(2000, env, agent_1)

In [None]:
# Mean reward per step over 2000 runs with epsilon = 0.0 (purely greedy agent).
agent_greedy = EpsilonGreedy(nb_actions=10, epsilon=0.0)
mean_rewards_0 = run_exp(2000, env, agent_greedy)

In [None]:
# Overlay the averaged reward curves of the three agents for comparison.
curves = (
    (mean_rewards_01, "tab:red", 'Egreedy 0.01'),
    (mean_rewards_1, "tab:blue", 'Egreedy 0.1'),
    (mean_rewards_0, "tab:green", 'Egreedy 0.0 (greedy)'),
)

for series, color, label in curves:
    plt.plot(series, color=color, label=label)

plt.xlabel('Steps')
plt.ylabel('Reward')

plt.legend()
plt.show()