## Imports

In [None]:
import numpy as np
import gymnasium as gym
import matplotlib.pyplot as plt

%matplotlib inline

## K-armed Bandit Problem

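A simple version of the k-armed bandit problem is useful because of its nonassociative nature: there is only one situation, so the agent never has to associate different actions with different states. This makes it a good problem (and environment) for learning basic reinforcement learning methods.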

So let's create a simple Gymnasium environment that re-creates the k-armed bandit problem.

### The k-armed bandit problem

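You are faced repeatedly with a choice among
k different options, or actions. After each choice you receive a numerical reward chosen
from a stationary probability distribution that depends on the action you selected. Your objective is to maximize the expected total reward over some time period, for example,
over 1000 action selections, or time steps.

The environment implemented below is the non-stationary variant of this problem: after every step, the true value of each arm takes a small independent random-walk step, so the best arm can change over time.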

In [None]:
class KArmedBanditNonStationary(gym.Env):
    """K-armed bandit whose true arm values drift after every step.

    ``reset`` draws one true value per arm from a normal distribution with
    mean 0 and standard deviation 1. Each ``step`` pays out the chosen arm's
    current value plus unit-variance Gaussian noise, then nudges every arm by
    an independent Gaussian increment with standard deviation 0.01.

    NOTE: ``step`` returns the reduced tuple ``(reward, terminated, info)``
    and ``reset`` returns nothing. This differs from the usual Gymnasium
    five-tuple / ``(observation, info)`` shapes, and the callers in this
    notebook rely on the reduced form.
    """

    def __init__(self, nb_arms=10, nb_steps=10_000):
        """Configure the bandit.

        Args:
            nb_arms: Number of arms, i.e. the size of the discrete action space.
            nb_steps: Number of steps after which ``step`` reports termination.
        """
        # There is a single, constant situation, hence one possible observation.
        self.observation_space = gym.spaces.Discrete(1)
        self.action_space = gym.spaces.Discrete(nb_arms)

        self._nb_steps = nb_steps
        self._nb_arms = nb_arms

    def step(self, action):
        """Pull arm ``action`` once.

        ``reset`` must have been called first: it creates the step counter
        and the arm values used here.

        Args:
            action: Index of the arm to pull.

        Returns:
            A tuple ``(reward, terminated, info)`` where ``reward`` is the
            arm's current value plus noise, ``terminated`` is True once
            ``nb_steps`` pulls have been made, and ``info`` holds
            ``"is_optimal_action"`` (1 if ``action`` was the arm with the
            highest true value before this step's drift, else 0).
        """
        self._step += 1
        episode_over = self._step >= self._nb_steps

        # Read everything that depends on the arm values *before* they drift.
        true_value = self._arms[action]
        picked_best = int(action == np.argmax(self._arms))

        # Draw order matters for reproducibility: reward noise first, drift second.
        noise = self.np_random.normal(0, 1, size=1)[0]
        self._arms += self.np_random.normal(0, 0.01, size=self._nb_arms)

        return true_value + noise, episode_over, {"is_optimal_action": picked_best}

    def reset(self, seed=None, options=None):
        """Start a new episode with freshly sampled arm values.

        Args:
            seed: Forwarded to ``gym.Env.reset``, which manages ``self.np_random``.
            options: Accepted for signature compatibility; not used.
        """
        super().reset(seed=seed)

        self._arms = self.np_random.normal(0, 1, size=self._nb_arms)
        self._step = 0


Let's check that the implementation of the k-armed bandit behaves as expected:

In [None]:
env = KArmedBanditNonStationary(nb_arms=10)
env.reset()

# Pull every arm once per sweep, for 200 sweeps, and look at the reward
# distribution of each arm. The arm values keep drifting while we sample,
# so each violin is slightly smeared compared to a stationary bandit.
index = list(range(len(env._arms)))
data = np.array([[env.step(arm)[0] for arm in index] for _ in range(200)])

plt.violinplot(data, index, showmeans=True)
plt.xlabel('Action')
plt.ylabel('Reward')
plt.show()

It looks good!

In [None]:
class EpsilonGreedy:
    """Epsilon-greedy agent with constant step-size action-value estimates.

    Attributes:
        nb_actions: Number of available actions.
        epsilon: Probability threshold below which a random action is taken.
        alpha: Constant step size used when updating an estimate.
        q: Array holding the current value estimate of each action.
    """

    def __init__(self, nb_actions, epsilon, alpha):
        """Store the hyper-parameters and start with all estimates at zero."""
        self.nb_actions = nb_actions
        self.epsilon = epsilon
        self.alpha = alpha

        self.q = np.zeros(nb_actions)

    def action(self):
        """Return the index of the next action.

        One uniform sample is always drawn from NumPy's global random state;
        if it falls below ``epsilon`` a uniformly random action is returned,
        otherwise the action with the highest estimate (first one on ties).
        """
        explore = np.random.uniform(0, 1) < self.epsilon

        if not explore:
            return np.argmax(self.q)

        return np.random.randint(0, self.nb_actions)

    def observe(self, action, reward):
        """Move the estimate of ``action`` a fraction ``alpha`` toward ``reward``."""
        error = reward - self.q[action]
        self.q[action] = self.q[action] + self.alpha * error

    def reset(self):
        """Forget everything learned: set every estimate back to zero."""
        self.q = np.zeros(self.nb_actions)

In [None]:
# Single-run setup: a 10-armed bandit and an epsilon-greedy agent that
# explores with epsilon = 0.01 and uses a constant step size of 0.1.
env = KArmedBanditNonStationary(nb_arms=10)
agent = EpsilonGreedy(nb_actions=10, epsilon=0.01, alpha=0.1)

In [None]:
def run_env(env, agent):
    """Play one episode of ``agent`` against ``env`` and record what happened.

    Both ``env`` and ``agent`` are reset first. The loop then asks the agent
    for an action, steps the environment, and feeds the reward back to the
    agent until the environment reports termination (at least one step is
    always taken).

    Args:
        env: Object with ``reset()`` and ``step(action)`` returning
            ``(reward, terminated, info)``, where ``info`` contains the key
            ``"is_optimal_action"``.
        agent: Object with ``reset()``, ``action()`` and
            ``observe(action, reward)``.

    Returns:
        Two arrays with one entry per step: the rewards received and the
        ``"is_optimal_action"`` flags.
    """
    env.reset()
    agent.reset()

    rewards = []
    optimal_flags = []

    while True:
        chosen = agent.action()
        reward, done, info = env.step(chosen)

        # Let the agent learn from this step before logging it.
        agent.observe(chosen, reward)

        rewards.append(reward)
        optimal_flags.append(info["is_optimal_action"])

        if done:
            break

    return np.array(rewards), np.array(optimal_flags)

In [None]:
# Play one full episode and keep the per-step rewards and optimal-action flags.
list_of_reward, list_of_optimal_action = run_env(env, agent)

In [None]:
# Raw reward received at each step of the single run.
plt.plot(list_of_reward)
plt.show()

In [None]:
# Per-step 0/1 flag: 1 when the agent picked the arm with the highest true value.
plt.plot(list_of_optimal_action)
plt.show()

It's hard to see any result here: a single run is too noisy.

Repeating this for 2000 independent runs,
each with a different bandit problem, we obtained measures of the learning algorithm's
average behavior.

In [None]:
def run_exp(nb_exps, env, agent):
    """Average ``nb_exps`` independent episodes of ``agent`` on ``env``.

    Each episode is played with ``run_env``, which resets both the
    environment and the agent, and the per-step results are summed
    element-wise. This assumes every episode has the same number of steps.

    Args:
        nb_exps: Number of episodes to average over; must be at least 1.
        env: Environment passed through to ``run_env``.
        agent: Agent passed through to ``run_env``.

    Returns:
        A tuple ``(mean_rewards, percent_optimal)`` of per-step arrays: the
        reward averaged over the episodes, and the percentage of episodes in
        which the optimal action was chosen at that step.

    Raises:
        ValueError: If ``nb_exps`` is smaller than 1. Previously one episode
            was still played and the sums were divided by zero or by a
            negative count.
    """
    if nb_exps < 1:
        raise ValueError(f"nb_exps must be at least 1, got {nb_exps}")

    first_rewards, first_optimal = run_env(env, agent)

    # Accumulate in float so the in-place additions below cannot fail on an
    # integer accumulator receiving float values.
    total_rewards = np.asarray(first_rewards, dtype=float)
    total_optimal = np.asarray(first_optimal, dtype=float)

    for _ in range(nb_exps - 1):
        run_rewards, run_optimal = run_env(env, agent)

        total_rewards += run_rewards
        total_optimal += run_optimal

    return total_rewards / nb_exps, (total_optimal / nb_exps) * 100

In [None]:
# Fresh 10-armed environment for the averaged experiments.
# NOTE(review): run_env() resets the environment at the start of every
# episode, so this explicit reset looks redundant.
env = KArmedBanditNonStationary(nb_arms=10)
env.reset()

In [None]:
# Average over 2000 episodes with epsilon = 0.01.
agent_01 = EpsilonGreedy(nb_actions=10, epsilon=0.01, alpha=0.1)
mean_rewards_01, percent_optimal_action_01 = run_exp(2000, env, agent_01)

In [None]:
# Average over 2000 episodes with epsilon = 0.1.
agent_1 = EpsilonGreedy(nb_actions=10, epsilon=0.1, alpha=0.1)
mean_rewards_1, percent_optimal_action_1 = run_exp(2000, env, agent_1)

In [None]:
# Average over 2000 episodes with epsilon = 0.0, i.e. a purely greedy agent.
agent_greedy = EpsilonGreedy(nb_actions=10, epsilon=0.0, alpha=0.1)
mean_rewards_0, percent_optimal_action_0 = run_exp(2000, env, agent_greedy)

In [None]:
# Mean reward per step for the three exploration rates, drawn in a fixed
# order so the legend entries keep the same sequence.
for _series, _color, _label in (
    (mean_rewards_01, "tab:red", 'Egreedy 0.01'),
    (mean_rewards_1, "tab:blue", 'Egreedy 0.1'),
    (mean_rewards_0, "tab:green", 'Egreedy 0.0 (greedy)'),
):
    plt.plot(_series, color=_color, label=_label)

plt.xlabel('Steps')
plt.ylabel('Reward')
plt.legend()
plt.show()

In [None]:
# Percentage of runs choosing the optimal arm at each step, for the three
# exploration rates, drawn in a fixed order so the legend keeps its sequence.
for _series, _color, _label in (
    (percent_optimal_action_01, "tab:red", 'Egreedy 0.01'),
    (percent_optimal_action_1, "tab:blue", 'Egreedy 0.1'),
    (percent_optimal_action_0, "tab:green", 'Egreedy 0.0 (greedy)'),
):
    plt.plot(_series, color=_color, label=_label)

plt.xlabel('Steps')
plt.ylabel('% Optimal Action')
plt.legend()
plt.show()