# Binary Bandit

In [None]:
import random
import matplotlib.pyplot as plt

# Define a BinaryBandit class to simulate the bandit problem
class BinaryBandit:
    """Two-armed bandit whose pulls pay a reward of either 0 or 1.

    Two fixed sets of per-arm success probabilities are built in; the
    ``reward_1_2`` flag given to :meth:`pull_arm` chooses between them.
    """

    def __init__(self):
        # The problem is binary: exactly two arms are available.
        self.n_arms = 2

    def actions(self):
        """Return the list of valid arm indices, ``0 .. n_arms - 1``."""
        return [arm for arm in range(self.n_arms)]

    def pull_arm(self, action, reward_1_2):
        """Pull one arm and return its reward.

        Args:
            action: Index of the arm to pull.
            reward_1_2: When truthy the success probabilities ``[0.1, 0.2]``
                are used, otherwise ``[0.7, 0.8]``.

        Returns:
            ``1`` if the pull succeeds, ``0`` otherwise.
        """
        # Draw before looking up the probability so that exactly one random
        # number is consumed per call, whatever ``action`` is.
        draw = random.random()
        success_probs = [0.1, 0.2] if reward_1_2 else [0.7, 0.8]
        return 1 if draw < success_probs[action] else 0

# Define an eGreedy function to implement the epsilon-greedy algorithm
def eGreedy(my_bandit, epsilon, max_iteration, reward_1_2=False):
    """Run the epsilon-greedy algorithm on a bandit and record its rewards.

    At every step one random draw decides between exploiting (pulling the arm
    with the highest current value estimate, the lowest index winning ties)
    and exploring (pulling an arm chosen uniformly at random). The value
    estimate of the pulled arm and the overall running average reward are
    both updated incrementally.

    Args:
        my_bandit: Object exposing ``n_arms``, ``actions()`` and
            ``pull_arm(action, reward_1_2)``.
        epsilon: Exploration rate. A step explores when the random draw is
            not greater than ``epsilon``.
        max_iteration: Number of arm pulls to perform.
        reward_1_2: Flag forwarded unchanged to ``my_bandit.pull_arm``.
            Defaults to ``False``, the value that used to be hard-coded.

    Returns:
        A tuple ``(Q, R_avg, R)`` where ``Q`` holds the final value estimate
        of each arm, ``R_avg`` holds the running average reward (starting
        with a leading ``0`` for "before any pull") and ``R`` holds the
        reward of every pull in order.
    """
    # Initialization
    Q = [0] * my_bandit.n_arms
    count = [0] * my_bandit.n_arms
    R = []
    R_avg = [0]

    # Steps are numbered from 1 so that ``step`` doubles as the divisor of the
    # running mean. The upper bound is inclusive so that exactly
    # ``max_iteration`` pulls are made (``range(1, max_iteration)`` would make
    # one pull too few).
    for step in range(1, max_iteration + 1):
        if random.random() > epsilon:
            # Exploit: choose the arm with the highest Q value
            action = Q.index(max(Q))
        else:
            # Explore: choose a random arm
            action = random.choice(my_bandit.actions())

        # Simulate a pull of the arm and record the resulting reward
        reward = my_bandit.pull_arm(action, reward_1_2)
        R.append(reward)

        # Incremental mean update of the chosen arm's value estimate
        count[action] += 1
        Q[action] += (reward - Q[action]) / count[action]

        # Incremental mean update of the average reward over all pulls so far
        R_avg.append(R_avg[-1] + (reward - R_avg[-1]) / step)

    return Q, R_avg, R

# Set the random seed for reproducibility
random.seed(10)

# Initialize the binary bandit problem
my_bandit = BinaryBandit()

# Exploration rate and iteration count for the epsilon-greedy run.
# The exploration rate has its own name so that it is not confused with, or
# overwritten by, the reward history ``R`` returned by eGreedy below.
epsilon = 0.2
N = 100

# Q: final value estimate per arm, R_avg: running average reward,
# R: reward obtained at each pull.
Q, R_avg, R = eGreedy(my_bandit, epsilon, N)

# Plot the average reward (left) and the reward per iteration (right)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
ax1.plot(R_avg)
ax1.set_title("Average rewards vs Iteration")
ax1.set_xlabel("Iteration")
ax1.set_ylabel("Average Reward")
ax2.plot(R)
ax2.set_title("Reward per iteration")
ax2.set_xlabel("Iteration")
ax2.set_ylabel("Reward")