# Bandit, Exploration and Exploitation

The goal of this exercise is to implement a simple bandit algorithm and test it on a simple environment. 

We will first start by understanding the problem. 

## 1. Understanding the Bandit Problem

In [1]:
import gymnasium as gym
import buffalo_gym

# Create a 3-armed bandit and pull arms uniformly at random to get a feel
# for the rewards each arm pays out.
env = gym.make("Buffalo-v0", arms=3)
# Gymnasium's reset() returns an (observation, info) pair, so unpack both.
obs, info = env.reset()
for _ in range(10):
    # Sample an arm uniformly at random from the action space.
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)
    print(f"Action: {action} - Reward: {reward}")
env.close()

Action: 0 - Reward: 8.712120281027941
Action: 2 - Reward: 3.2208998151730412
Action: 2 - Reward: 1.830416837274484
Action: 1 - Reward: 2.2391969627753454
Action: 2 - Reward: 3.5057851711861545
Action: 1 - Reward: -2.1048408678210935
Action: 2 - Reward: 2.3880164249171147
Action: 0 - Reward: 9.584807438315394
Action: 1 - Reward: 0.8030161050395389
Action: 2 - Reward: 3.249491480681207


Answer the following questions:
1. What is the code doing?
2. What is the best option to take?
3. What is the expected reward of taking the best option?

## 2. Implementing an Incremental Update Rule

Complete the function `incremental_update` to implement the incremental update rule for the action-value estimates.

In [None]:
def incremental_update(Q, Times, action, reward):
    """
    Update the action-value estimate Q for the given action and reward using an incremental update rule.

    The sample-average rule is applied:

        Q[action] <- Q[action] + (reward - Q[action]) / Times[action]

    The count in ``Times[action]`` must already include the current pull
    (the calling code increments it before calling this function), so the
    step size is 1/N and Q[action] stays equal to the mean of all rewards
    observed so far for that action.

    Parameters:
    Q (list): A list of action-value estimates for each action. Modified in place.
    Times (list): A list of counts of how many times each action has been taken.
    action (int): The index of the action taken.
    reward (float): The reward received after taking the action.

    Returns:
    list: Updated list of action-value estimates (the same object as ``Q``).

    Raises:
    ValueError: If ``Times[action]`` is not positive, i.e. the current pull
        has not been counted yet.
    """
    pulls = Times[action]
    if pulls <= 0:
        # Fail with a clear message rather than a bare ZeroDivisionError.
        raise ValueError(
            "Times[action] must be incremented before calling incremental_update"
        )

    # Move the estimate towards the new reward by a 1/N step.
    Q[action] += (reward - Q[action]) / pulls

    return Q


And execute the following code to test your implementation:

In [None]:
import gymnasium as gym
import buffalo_gym

# Estimate the value of every arm of a 10-armed bandit by pulling arms
# uniformly at random until the environment reports the end of the episode.
arms = 10
Q = [0.0] * arms      # action-value estimate per arm
Times = [0] * arms    # number of pulls per arm
env = gym.make("Buffalo-v0", arms=arms)
# Gymnasium's reset() returns an (observation, info) pair, so unpack both.
obs, info = env.reset()
done = False
while not done:
    action = env.action_space.sample()
    obs, reward, terminated, truncated, info = env.step(action)
    #print(f"Action: {action} - Reward: {reward}")
    # Count the pull first: incremental_update receives the updated count.
    Times[action] += 1
    Q = incremental_update(Q, Times, action, reward)
    # The episode is over once the environment terminates or truncates it.
    done = terminated or truncated
env.close()

print("Final action-value estimates:", Q)

Questions: 
1. Which is the best action?
2. What is the expected reward of taking the best action?

## 3. Greedy Action Selection

Now that we have implemented the incremental update rule, we can implement a greedy action selection strategy.
Complete the function `greedy_action_selection` to implement a greedy action selection strategy.
This function will replace the instruction `action = env.action_space.sample()` in the code above.

In [None]:
import numpy as np

# We don't want to use the argmax function from numpy because it doesn't break ties randomly. 
# We want to implement our own version of argmax that breaks ties randomly.

def argmax(q_values):
    """
    Takes in a list of q_values and returns the index of the item
    with the highest value. Breaks ties randomly.

    The random number generator (np.random.choice) is called exactly once
    per call, even when there is a single maximum, so seeded runs stay
    reproducible.

    returns: int - the index of the highest value in q_values
    raises: ValueError - if q_values is empty (or holds no comparable value)
    """
    top_value = float("-inf")
    ties = []

    for i, value in enumerate(q_values):
        if value > top_value:
            # New strict maximum: forget every index collected so far.
            top_value = value
            ties = [i]
        elif value == top_value:
            # Same as the current maximum: remember it as a tie candidate.
            ties.append(i)

    if not ties:
        raise ValueError("argmax requires at least one comparable value")

    # return a random selection from ties.
    return int(np.random.choice(ties))

In [None]:
# --------------
# Debugging Cell
# --------------
# Feel free to make any changes to this cell to debug your code

# Quick sanity checks for argmax.

# Single maximum: index 8 is the only valid answer.
test_array = [0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
assert argmax(test_array) == 8, "Check your argmax implementation returns the index of the largest value"

# make sure np.random.choice is called correctly
# Indices 0 and 3 tie for the maximum; with the seed fixed to 0 this check
# expects the tie-break to pick index 0.
np.random.seed(0)
test_array = [1, 0, 0, 1]

assert argmax(test_array) == 0

In [None]:
# More testing to make sure argmax breaks ties randomly instead of always
# choosing the first (or last) maximal entry.

# Single maximum: index 8 is the only valid answer.
test_array = [0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
assert argmax(test_array) == 8, "Check your argmax implementation returns the index of the largest value"

# set random seed so results are deterministic
np.random.seed(0)
test_array = [1, 0, 0, 1]

# Tally how often each index is returned over 100 calls; only the tied
# indices 0 and 3 are expected to receive any counts.
counts = [0, 0, 0, 0]
for _ in range(100):
    a = argmax(test_array)
    counts[a] += 1

# make sure argmax does not always choose first entry
assert counts[0] != 100, "Make sure your argmax implementation randomly choooses among the largest values."

# make sure argmax does not always choose last entry
assert counts[3] != 100, "Make sure your argmax implementation randomly choooses among the largest values."

# make sure the random number generator is called exactly once whenever `argmax` is called
# (extra or missing RNG calls would shift the seeded sequence and change these counts)
expected = [44, 0, 0, 56] # <-- notice not perfectly uniform due to randomness
assert counts == expected

In [None]:

def greedy_action_selection(Q):
    """
    Select an action using a greedy action selection strategy based on the action-value estimates Q.

    The action with the highest estimate is chosen. When several actions
    share the highest estimate, one of them is picked uniformly at random
    so that no action is favoured merely because of its position in Q.

    Parameters:
    Q (list): A list of action-value estimates for each action. Must be non-empty.

    Returns:
    int: The index of the selected action.

    Raises:
    ValueError: If Q is empty.
    """
    estimates = np.asarray(Q, dtype=float)
    if estimates.size == 0:
        raise ValueError("Q must contain at least one action-value estimate")

    # Indices of every action whose estimate equals the maximum.
    best_actions = np.flatnonzero(estimates == estimates.max())

    # Random tie-breaking among the best actions.
    return int(np.random.choice(best_actions))

In [None]:
# In this version, we run 1000 steps of the
# bandit problem and record the reward obtained at each step.
# We repeat this for 2000 independent runs and average the rewards per step
# to see if the agent is learning to select the best action.

import gymnasium as gym
import buffalo_gym

arms = 10
steps = 1000
runs = 2000

# Sum of the rewards obtained at each step over all runs; divided by `runs`
# at the end to give the average reward per step.
average_rewards = np.zeros(steps)

for run in range(runs):

    # Every run starts from scratch: fresh estimates, counts and environment.
    Q = [0.0] * arms
    Times = [0] * arms

    Rewards = np.zeros(steps)
    env = gym.make("Buffalo-v0", arms=arms)

    # Gymnasium's reset() returns an (observation, info) pair, so unpack both.
    obs, info = env.reset()
    for step in range(steps):
        action = greedy_action_selection(Q)
        obs, reward, terminated, truncated, info = env.step(action)
        #print(f"Action: {action} - Reward: {reward}")
        # Count the pull first: incremental_update receives the updated count.
        Times[action] += 1
        Q = incremental_update(Q, Times, action, reward)
        Rewards[step] = reward

    env.close()
    #print("Final action-value estimates:", Q)
    average_rewards += Rewards

average_rewards /= runs

# plot the average rewards over steps
import matplotlib.pyplot as plt
plt.plot(average_rewards)
plt.xlabel("Steps")
plt.ylabel("Average Reward")
plt.title("Average Reward over Steps")
plt.show()

Questions: 
1. Is the agent able to find the best action?
2. Are the rewards improving over time?

## 4. Epsilon-Greedy Action Selection

Now that we have implemented a greedy action selection strategy, we can implement an epsilon-greedy action selection strategy.
Complete the function `epsilon_greedy_action_selection` to implement an epsilon-greedy action selection strategy.
This function will replace the instruction `action = greedy_action_selection(Q)` in the code above.

In [None]:
def epsilon_greedy_action_selection(Q, epsilon):
    """
    Select an action using an epsilon-greedy action selection strategy based on the action-value estimates Q.

    With probability epsilon an action is drawn uniformly at random from all
    actions (exploration). Otherwise the action with the highest estimate is
    chosen, breaking ties uniformly at random (exploitation).

    Parameters:
    Q (list): A list of action-value estimates for each action. Must be non-empty.
    epsilon (float): The probability of selecting a random action (exploration rate), in [0, 1].

    Returns:
    int: The index of the selected action.

    Raises:
    ValueError: If Q is empty or epsilon is outside [0, 1].
    """
    if not 0.0 <= epsilon <= 1.0:
        raise ValueError("epsilon must be a probability in [0, 1]")

    estimates = np.asarray(Q, dtype=float)
    if estimates.size == 0:
        raise ValueError("Q must contain at least one action-value estimate")

    # Explore: np.random.random() lies in [0, 1), so epsilon=0 never explores
    # and epsilon=1 always does.
    if np.random.random() < epsilon:
        return int(np.random.randint(estimates.size))

    # Exploit: pick among the actions that share the highest estimate.
    best_actions = np.flatnonzero(estimates == estimates.max())
    return int(np.random.choice(best_actions))

In [None]:
# In this version, we run 1000 steps of the
# bandit problem and record the reward obtained at each step.
# We repeat this for 2000 independent runs and average the rewards per step
# to see if the agent is learning to select the best action.

import gymnasium as gym
import buffalo_gym

arms = 10
steps = 1000
runs = 2000
epsilon = 0.1  # exploration rate used by the epsilon-greedy policy

# This must be a NumPy array, not a Python list: `list += ndarray` would
# extend the list instead of adding element-wise, and `list /= runs` raises
# a TypeError.
average_rewards_ep = np.zeros(steps)

for run in range(runs):

    # Every run starts from scratch: fresh estimates, counts and environment.
    Q = [0.0] * arms
    Times = [0] * arms

    Rewards = np.zeros(steps)
    env = gym.make("Buffalo-v0", arms=arms)

    # Gymnasium's reset() returns an (observation, info) pair, so unpack both.
    obs, info = env.reset()
    for step in range(steps):
        action = epsilon_greedy_action_selection(Q, epsilon=epsilon)
        obs, reward, terminated, truncated, info = env.step(action)
        #print(f"Action: {action} - Reward: {reward}")
        # Count the pull first: incremental_update receives the updated count.
        Times[action] += 1
        Q = incremental_update(Q, Times, action, reward)
        Rewards[step] = reward

    env.close()
    #print("Final action-value estimates:", Q)
    average_rewards_ep += Rewards

average_rewards_ep /= runs

# plot the average of Rewards over steps
import matplotlib.pyplot as plt
plt.plot(average_rewards_ep)
plt.xlabel("Steps")
plt.ylabel("Average Reward")
plt.title("Average Reward over Steps")
plt.show()

In [None]:
# plot both greedy and epsilon-greedy rewards over steps
import matplotlib.pyplot as plt
# Overlay the greedy and epsilon-greedy learning curves on one figure so the
# two strategies can be compared directly.
curves = (
    (average_rewards, "Greedy"),
    (average_rewards_ep, "Epsilon-Greedy e=0.1"),
)
for curve, label in curves:
    plt.plot(curve, label=label)
plt.title("Average Reward over Steps")
plt.xlabel("Steps")
plt.ylabel("Average Reward")
plt.legend()
plt.show()