In [1]:
import numpy as np
import matplotlib.pyplot as plt

# Class that represents the multi-armed bandit environment
class BanditEnvironment:
    """Multi-armed bandit whose arms pay Bernoulli (0 or 1) rewards.

    Attributes:
        true_rewards: List holding the success probability of each arm;
            the index in the list is the arm (action) id.
    """

    # Arm probabilities used when the caller does not supply any.
    DEFAULT_REWARDS = (0.2, 0.8, 0.5, 0.3, 0.9)

    def __init__(self, true_rewards=None):
        """Create the environment.

        Args:
            true_rewards: Optional iterable with one success probability per
                arm, each in [0, 1]. When omitted, ``DEFAULT_REWARDS`` is used.

        Raises:
            ValueError: If no arm is given or a probability is outside [0, 1].
        """
        if true_rewards is None:
            true_rewards = self.DEFAULT_REWARDS
        # Copy into a fresh list so the environment never aliases caller data.
        probabilities = [float(p) for p in true_rewards]
        if not probabilities:
            raise ValueError("true_rewards must contain at least one arm")
        # Written as `not (0 <= p <= 1)` so that NaN is rejected as well.
        if any(not 0.0 <= p <= 1.0 for p in probabilities):
            raise ValueError("reward probabilities must lie in [0, 1]")
        self.true_rewards = probabilities

    def step(self, action):
        """Pull arm ``action`` and return the sampled reward.

        Args:
            action: Index into ``true_rewards`` of the arm to pull.

        Returns:
            1 with probability ``true_rewards[action]``, otherwise 0.
        """
        # np.random.random() is uniform on [0, 1), so the comparison succeeds
        # with exactly the arm's success probability.
        return 1 if np.random.random() < self.true_rewards[action] else 0

# Class that implements an epsilon-greedy agent
class EpsilonGreedyAgent:
    """Epsilon-greedy agent that learns action values by sample averaging.

    Attributes:
        n_actions: Number of available actions (bandit arms).
        epsilon: Current probability of taking a random (exploratory) action.
        decay_rate: Factor epsilon is multiplied by on each ``decay_epsilon``.
        q_values: Estimated mean reward of each action, initialised to 0.
        action_counts: Number of updates received by each action.
    """

    def __init__(self, n_actions, epsilon=0.1, decay_rate=0.99):
        """Set up the agent.

        Args:
            n_actions: Number of possible actions; must be at least 1.
            epsilon: Initial exploration probability.
            decay_rate: Multiplicative decay applied to epsilon.

        Raises:
            ValueError: If ``n_actions`` is smaller than 1.
        """
        if n_actions < 1:
            raise ValueError("n_actions must be at least 1")
        self.n_actions = n_actions
        self.epsilon = epsilon
        self.decay_rate = decay_rate
        self.q_values = np.zeros(n_actions)
        # Counts are whole numbers; the division in update_q_value still
        # yields a float.
        self.action_counts = np.zeros(n_actions, dtype=int)

    def select_action(self):
        """Choose an action with the epsilon-greedy rule.

        With probability ``epsilon`` a uniformly random action is returned.
        Otherwise the action with the highest estimate is returned, and ties
        between equally good actions are broken uniformly at random.

        Returns:
            The chosen action index as a plain ``int``.
        """
        if np.random.random() < self.epsilon:
            return int(np.random.randint(self.n_actions))

        # np.argmax alone always returns the lowest tied index. Every estimate
        # starts at 0, so that would make the greedy choice favour arm 0.
        best = np.flatnonzero(self.q_values == self.q_values.max())
        if best.size <= 1:
            # Single winner, or no match at all (NaN estimates): use argmax
            # and draw no extra random number.
            return int(np.argmax(self.q_values))
        return int(np.random.choice(best))

    def update_q_value(self, action, reward):
        """Fold ``reward`` into the running mean estimate of ``action``.

        Args:
            action: Index of the action that was taken.
            reward: Reward observed for that action.
        """
        self.action_counts[action] += 1
        # Incremental mean: Q <- Q + (r - Q) / n, so no reward history is kept.
        error = reward - self.q_values[action]
        self.q_values[action] += error / self.action_counts[action]

    def decay_epsilon(self):
        """Multiply epsilon by ``decay_rate`` to reduce exploration over time."""
        self.epsilon *= self.decay_rate

# Function that simulates the bandit problem with the epsilon-greedy agent
def simulate_bandit_problem(n_steps=1000, epsilon=0.3, decay_rate=0.99, seed=None):
    """Run an epsilon-greedy agent against the bandit environment.

    Args:
        n_steps: Number of arm pulls to simulate.
        epsilon: Initial exploration probability handed to the agent.
        decay_rate: Per-step multiplicative decay of epsilon.
        seed: Optional seed for NumPy's global random state. When given, the
            global generator is reseeded so the run is reproducible; when
            ``None`` the random state is left untouched.

    Returns:
        A tuple ``(rewards_history, actions_history, q_values)`` with the
        reward obtained at every step, the action taken at every step, and
        the agent's final value estimates.
    """
    if seed is not None:
        np.random.seed(seed)

    env = BanditEnvironment()
    # Take the arm count from the environment instead of repeating it here,
    # so the two cannot drift apart.
    agent = EpsilonGreedyAgent(
        n_actions=len(env.true_rewards),
        epsilon=epsilon,
        decay_rate=decay_rate,
    )

    rewards_history = []
    actions_history = []

    for _ in range(n_steps):
        # Act, observe the reward, learn from it, then reduce exploration.
        action = agent.select_action()
        reward = env.step(action)
        agent.update_q_value(action, reward)
        agent.decay_epsilon()

        # Keep the trajectory for later analysis.
        rewards_history.append(reward)
        actions_history.append(action)

    return rewards_history, actions_history, agent.q_values

# Run the simulation and obtain results
rewards, actions, final_q_values = simulate_bandit_problem()

# Print final results. The true values are read from a default environment
# rather than repeated as a literal, so they cannot drift from the simulation.
print("Final estimated Q-values:", final_q_values)
print("True values:", BanditEnvironment().true_rewards)
print("Average reward:", np.mean(rewards))


Final estimated Q-values: [0.17647059 0.79977629 0.375      0.4        0.90789474]
True values: [0.2, 0.8, 0.5, 0.3, 0.9]
Average reward: 0.792
