In [None]:
import random
import math as m
import matplotlib.pyplot as plt

In [None]:
def addHistory(history, choice, reward):
    """Record one time step in every action's history series.

    Each inner list of ``history`` grows by exactly one entry per call, so
    all series stay the same length.

    Args:
        history: List of lists, one inner list per action. Mutated in place.
        choice: Index of the action taken on this step.
        reward: Value to record for the chosen action.

    Returns:
        None.
    """
    for index, series in enumerate(history):
        if index == choice:
            # The chosen action gets the freshly supplied value.
            series.append(reward)
        elif series:
            # Unchosen actions carry their latest value forward.
            series.append(series[-1])
        else:
            # Nothing recorded yet for this action: start the series at 0.
            series.append(0)

In [None]:
def banditEpsilon(actions, epsilon, title, steps=1000):
    """Run the epsilon-greedy bandit algorithm and plot the value estimates.

    Every action is first taken once, in order, to seed its estimate. On each
    later step the action with the highest current estimate is taken with
    probability ``1 - epsilon``; otherwise one of the *other* actions is
    picked at random. The estimate of the action taken is updated as an
    incremental sample average, and the estimates of all actions are
    recorded after every step and plotted as one line per action.

    Args:
        actions: Sequence of ``[name, mean_reward, variance]`` entries, one
            per action. ``name`` is used only for the plot legend.
        epsilon: Probability of exploring instead of taking the best action.
        title: Title of the generated plot.
        steps: Number of steps to run the algorithm (defaults to 1000).

    Returns:
        None. The plot is shown via ``plt.show()``.

    Raises:
        ValueError: If ``actions`` is empty.
    """
    if not actions:
        raise ValueError("actions must contain at least one action")

    numActions = len(actions)
    q = []              # Expected reward of each action
    timesChosen = []    # Number of times each action was chosen
    # One series per action tracking its expected reward at each step.
    history = [[] for _ in range(numActions)]

    for step in range(steps):
        if step < numActions:
            # Initialise the estimates by taking each action once, in order.
            choice = step
            # normalvariate expects a standard deviation, hence the sqrt.
            reward = random.normalvariate(actions[choice][1], m.sqrt(actions[choice][2]))
            timesChosen.append(1)
            q.append(reward)
        else:
            # Index of the first action holding the highest estimate.
            bestChoice = q.index(max(q))
            # Draw a value between 0 and 1 to decide exploit vs. explore.
            p = random.uniform(0, 1)
            choice = bestChoice
            # With a single action there is nothing else to explore; skipping
            # the rejection loop here avoids spinning forever.
            if p >= 1 - epsilon and numActions > 1:
                while choice == bestChoice:
                    choice = random.randint(0, numActions - 1)
            reward = random.normalvariate(actions[choice][1], m.sqrt(actions[choice][2]))
            timesChosen[choice] += 1
            # Incremental sample-average update of the estimate.
            q[choice] = q[choice] + ((reward - q[choice]) / timesChosen[choice])
        addHistory(history, choice, q[choice])

    # Plot one line per action, however many actions were supplied.
    stepNumbers = range(1, steps + 1)
    for action, estimates in zip(actions, history):
        label = "Action {}: Mean = {}, Variance = {}".format(action[0], action[1], action[2])
        plt.plot(stepNumbers, estimates, label=label)
    plt.title(title)
    plt.legend()
    plt.show()

In [None]:
# Part a: epsilon = 0.2, with action variances of 3, 6 and 15.
a = [
    ["A", 5, 3],
    ["B", 10, 6],
    ["C", 15, 15],
]
e = 0.2
banditEpsilon(actions=a, epsilon=e, title="Part a: epsilon = 0.2")

In [None]:
# Part b: epsilon = 0.1, with action variances of 3, 6 and 15.
a = [
    ["A", 5, 3],
    ["B", 10, 6],
    ["C", 15, 15],
]
e = 0.1
banditEpsilon(actions=a, epsilon=e, title="Part b: epsilon = 0.1")


In [None]:
# Part c: epsilon = 0 (always take the best-looking action), variances 3, 6 and 15.
a = [
    ["A", 5, 3],
    ["B", 10, 6],
    ["C", 15, 15],
]
e = 0
banditEpsilon(actions=a, epsilon=e, title="Part c: epsilon = 0")


In [None]:
# Part d: epsilon = 0.2, with each action's variance equal to its mean.
a = [
    ["A", 5, 5],
    ["B", 10, 10],
    ["C", 15, 15],
]
e = 0.2
banditEpsilon(actions=a, epsilon=e, title="Part d: epsilon = 0.2")


In [None]:
# Part e: epsilon = 0.1, with each action's variance equal to its mean.
a = [
    ["A", 5, 5],
    ["B", 10, 10],
    ["C", 15, 15],
]
e = 0.1
banditEpsilon(actions=a, epsilon=e, title="Part e: epsilon = 0.1")


In [None]:
# Part f: epsilon = 0 (always take the best-looking action), variance equal to mean.
a = [
    ["A", 5, 5],
    ["B", 10, 10],
    ["C", 15, 15],
]
e = 0
banditEpsilon(actions=a, epsilon=e, title="Part f: epsilon = 0")


In [None]:
# Part g: epsilon = 0.2, with every action's variance set to 1.
a = [
    ["A", 5, 1],
    ["B", 10, 1],
    ["C", 15, 1],
]
e = 0.2
banditEpsilon(actions=a, epsilon=e, title="Part g: epsilon = 0.2")


In [None]:
# Part h: epsilon = 0.1, with every action's variance set to 1.
a = [
    ["A", 5, 1],
    ["B", 10, 1],
    ["C", 15, 1],
]
e = 0.1
banditEpsilon(actions=a, epsilon=e, title="Part h: epsilon = 0.1")


In [None]:
# Part i: epsilon = 0 (always take the best-looking action), every variance set to 1.
a = [
    ["A", 5, 1],
    ["B", 10, 1],
    ["C", 15, 1],
]
e = 0
banditEpsilon(actions=a, epsilon=e, title="Part i: epsilon = 0")


In [None]:
# Part j: epsilon = 0.1, all actions share a mean of 10 and differ only in variance.
a = [
    ["A", 10, 8],
    ["B", 10, 5],
    ["C", 10, 12],
]
e = 0.1
banditEpsilon(actions=a, epsilon=e, title="Part j: epsilon = 0.1")
