# Binary Bandit

In [None]:
import random
import matplotlib.pyplot as plt

# Two-armed bandit whose arms pay out either 1 or 0
class BinaryBandit:
    """Two-armed bandit with binary (0/1) rewards.

    The success probabilities are not stored on the instance; the caller
    passes them in on every pull.
    """

    def __init__(self):
        # A binary bandit always exposes exactly two arms
        self.n_arms = 2

    def actions(self):
        """Return the available arm indices as a list, ``0 .. n_arms - 1``."""
        return [*range(self.n_arms)]

    def pull_arm(self, action, p):
        """Pull arm ``action`` and return its reward.

        ``p`` is indexed by arm. One uniform sample is taken from
        ``random.random()``; the reward is 1 when that sample is strictly
        below ``p[action]`` and 0 otherwise.
        """
        draw = random.random()
        return 1 if draw < p[action] else 0

# Epsilon-greedy agent using incremental sample-average value estimates
def eGreedy(my_bandit, epsilon, max_iteration, p):
    """Run epsilon-greedy on ``my_bandit`` and return what was learned.

    Args:
        my_bandit: object exposing ``n_arms``, ``actions()`` and
            ``pull_arm(action, p)``.
        epsilon: exploration threshold. A uniform sample is drawn each step;
            the greedy arm is used only when the sample is strictly greater
            than ``epsilon``, otherwise a random arm is chosen.
        max_iteration: loop bound. The step counter runs from 1 up to but
            not including this value, so ``max_iteration - 1`` pulls happen.
        p: passed through unchanged to ``my_bandit.pull_arm``.

    Returns:
        Tuple ``(Q, R_avg, R)``: the per-arm value estimates, the running
        mean reward (with a leading 0 entry before any pull), and the raw
        reward of every pull.
    """
    arm_total = my_bandit.n_arms
    estimates = [0] * arm_total
    pulls = [0] * arm_total
    history = []
    running_means = [0]

    for step in range(1, max_iteration):
        # One uniform draw decides between exploiting and exploring
        exploit = random.random() > epsilon
        if exploit:
            # Ties resolve to the lowest arm index (list.index finds the first)
            top_value = max(estimates)
            chosen = estimates.index(top_value)
        else:
            chosen = random.choice(my_bandit.actions())

        payoff = my_bandit.pull_arm(chosen, p)
        history.append(payoff)

        # Incremental mean update for the arm that was just pulled
        pulls[chosen] += 1
        estimates[chosen] += (payoff - estimates[chosen]) / pulls[chosen]

        # Incremental mean over all rewards seen so far
        previous_mean = running_means[-1]
        running_means.append(previous_mean + (payoff - previous_mean) / step)

    return estimates, running_means, history

# Fix the seed so repeated runs produce the same pulls
random.seed(10)

# Environment: the two-armed binary bandit
my_bandit = BinaryBandit()

# Experiment settings. R holds the exploration rate (epsilon) only until the
# eGreedy call below, which rebinds R to the list of per-pull rewards.
R = 0.2
N = 100
p1 = [0.36, 0.6]
p2 = [0.2, 0.8]

# Run epsilon-greedy on bandit 1 (success probabilities p1)
Q, R_avg, R = eGreedy(my_bandit, R, N, p1)
print("Bandit1 : ")
print(p1)

# Left panel: running average reward; right panel: raw reward of each pull
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
for axis, series, title, y_label in (
    (ax1, R_avg, "Average rewards vs Iteration", "Average Reward"),
    (ax2, R, "Reward per iteration", "Reward"),
):
    axis.plot(series)
    axis.set_title(title)
    axis.set_xlabel("Iteration")
    axis.set_ylabel(y_label)

# Q2 : Code for n-Arm Bandit with modified ε-greedy

In [None]:
import random
import matplotlib.pyplot as plt

# Multi-armed bandit whose per-arm expected rewards drift on every pull
class Bandit:
    """Bandit environment with ``num_arms`` arms and drifting expected rewards."""

    def __init__(self, num_arms):
        # Number of arms offered by this bandit
        self.num_arms = num_arms
        # Every arm starts from the same expected reward
        self.expected_rewards = [10 for _ in range(num_arms)]

    def actions(self):
        """Return the list of arm indices, ``0 .. num_arms - 1``."""
        return list(range(self.num_arms))

    def reward(self, action):
        """Drift every arm, then return a noisy reward for ``action``.

        Each call first adds an independent ``random.gauss(0, 0.1)`` sample
        to every entry of ``expected_rewards`` (in arm order, in place), and
        then returns the drifted expectation of ``action`` plus a
        ``random.gauss(0, 0.01)`` sample.
        """
        # Slice assignment keeps the same list object while updating all arms
        self.expected_rewards[:] = [
            mean + random.gauss(0, 0.1) for mean in self.expected_rewards
        ]
        return self.expected_rewards[action] + random.gauss(0, 0.01)
         

def e_greedy(bandit, epsilon, max_iteration):
    """Run epsilon-greedy with sample-average estimates on ``bandit``.

    Args:
        bandit: object exposing ``num_arms``, ``actions()`` and
            ``reward(action)``.
        epsilon: exploration threshold. The greedy arm is taken only when a
            uniform sample is strictly greater than ``epsilon``; otherwise a
            random arm is chosen.
        max_iteration: loop bound. The step counter runs from 1 up to but
            not including this value, so ``max_iteration - 1`` pulls happen.

    Returns:
        Tuple ``(q_values, avg_rewards, rewards)``: per-arm value estimates,
        the running average reward (with a leading 0 entry before any pull),
        and the reward obtained at each pull.
    """
    arm_total = bandit.num_arms
    estimates = [0] * arm_total   # current value estimate of each arm
    pulls = [0] * arm_total       # how many times each arm was selected
    history = []                  # reward received at each step
    running_avgs = [0]            # average reward up to each step
    cumulative = 0                # sum of all rewards so far

    for step in range(1, max_iteration):
        exploit = random.random() > epsilon
        if exploit:
            # Greedy arm; ties resolve to the lowest index
            top_value = max(estimates)
            chosen = estimates.index(top_value)
        else:
            # Uniformly random arm
            chosen = random.choice(bandit.actions())

        payoff = bandit.reward(chosen)
        history.append(payoff)

        # Incremental sample-average update for the chosen arm
        pulls[chosen] += 1
        estimates[chosen] += (payoff - estimates[chosen]) / pulls[chosen]

        cumulative += payoff
        running_avgs.append(cumulative / step)

    return estimates, running_avgs, history

# Set the random seed for reproducibility
random.seed(10)

# Create a bandit environment with 10 arms
my_bandit = Bandit(10)

# Run the e-greedy algorithm with epsilon=0.36 and a loop bound of 20000
q_values, avg_rewards, rewards = e_greedy(my_bandit, 0.36, 20000)

# Print the bandit's expected rewards as they stand after the run next to the
# estimates the agent recovered (Bandit.reward drifts the expected rewards on
# every call, so these are the final values, not the initial ones)
print("Actual\tRecovered ")
for i, j in zip(my_bandit.expected_rewards, q_values):
    print(f"{i:.3f}\t{j:.3f}")

# Plot the average rewards obtained and the rewards obtained at each iteration.
# Use the results of THIS run (avg_rewards / rewards); plotting R_avg / R would
# show the stale binary-bandit results, or fail if that cell was never run.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
ax1.plot(avg_rewards)
ax1.set_title("Average rewards V/s Iteration")
ax1.set_xlabel("Iteration")
ax1.set_ylabel("Average Reward")
ax2.plot(rewards)
ax2.set_title("Reward per iteration")
ax2.set_xlabel("Iteration")
ax2.set_ylabel("Reward")
fig.suptitle("Unmodified Epsilon Greedy Policy")
plt.show()