In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Default number of arms (levers); used as the default `k_arms` of the
# environments and agents in this notebook.
K_ARMS = 10


class Bandit:
    """Stationary k-armed bandit environment.

    Every arm ``a`` has a hidden true value ``q_star[a]``. Pulling an arm
    returns a reward sampled from a normal distribution with that mean and
    unit standard deviation. All randomness comes from NumPy's global
    ``np.random`` state.
    """

    def __init__(self, k_arms=K_ARMS):
        """Create an environment with ``k_arms`` arms and sample their true values."""
        self.k = k_arms
        self.reset()

    def reset(self):
        """Re-sample the true value of every arm from a standard normal."""
        self.q_star = np.random.normal(loc=0, scale=1, size=self.k)

    def step(self, action):
        """Pull arm ``action`` and return the sampled reward.

        Raises:
            ValueError: if ``action`` is negative or not smaller than ``self.k``.
        """
        out_of_range = action < 0 or action >= self.k
        if out_of_range:
            raise ValueError("Invalid action")
        true_value = self.q_star[action]
        return np.random.normal(loc=true_value, scale=1)

In [None]:
class EpsilonGreedyAgent:
    """Epsilon-greedy agent with sample-average value estimates.

    With probability ``epsilon`` the agent explores by picking a uniformly
    random arm; otherwise it exploits the arm whose current estimate is
    highest. Randomness comes from NumPy's global ``np.random`` state.
    """

    def __init__(self, epsilon=0.1, k_arms=K_ARMS):
        """Store the exploration rate and arm count, then clear all estimates."""
        self.k = k_arms
        self.epsilon = epsilon
        self.reset()

    def reset(self):
        """Forget everything learned so far."""
        # number of times each lever has been pulled
        self.action_count = np.zeros(self.k)
        # estimated expected reward of each lever
        self.q_est = np.zeros(self.k)

    def choose_action(self):
        """Return the index of the arm to pull next."""
        explore = np.random.random() < self.epsilon
        # np.argmax returns the lowest index when several estimates tie.
        return np.random.randint(self.k) if explore else np.argmax(self.q_est)

    def update(self, action, reward):
        """Record one pull of ``action`` and fold ``reward`` into its running mean."""
        self.action_count[action] += 1
        pulls = self.action_count[action]
        error = reward - self.q_est[action]
        # Incremental sample average: Q <- Q + (R - Q) / N
        self.q_est[action] += error / pulls

In [None]:
def run_experiment(agent, env, runs=2000, steps=1000):
    """Let ``agent`` interact with ``env`` for ``runs`` independent runs.

    Both the agent and the environment are reset at the start of every run
    (agent first, then environment). On each step the agent chooses an
    action, the environment returns a reward for it, and the agent is
    updated with that reward.

    Returns:
        A list with one ``(run, step, reward)`` tuple per interaction, in
        execution order.
    """

    def play_step(run_idx, step_idx):
        # One agent/environment interaction, returned as a log record.
        action = agent.choose_action()
        reward = env.step(action)
        agent.update(action, reward)
        return run_idx, step_idx, reward

    history = []
    for run_idx in range(runs):
        agent.reset()
        env.reset()
        history.extend(play_step(run_idx, step_idx) for step_idx in range(steps))
    return history

def plot_results(results):
    """Plot the reward per step, averaged over runs, with one curve per key.

    Args:
        results: mapping from a label (the epsilon value in this notebook) to
            a sequence of ``(run, step, reward)`` records, as returned by
            ``run_experiment``.

    The figure is created on a fresh 12x5 inch canvas. Title, axis labels and
    legend are set once after all curves are drawn rather than on every loop
    iteration.
    """
    fig, ax = plt.subplots(figsize=(12, 5))
    for eps, logs in results.items():
        df = pd.DataFrame(logs, columns=["run", "step", "reward"])
        # Average over runs: one mean reward per step index.
        mean_reward = df.groupby("step")["reward"].mean()
        ax.plot(mean_reward.index, mean_reward.to_numpy(), label=str(eps))

    ax.set_title("Average reward over time")
    ax.set_xlabel("Step")
    ax.set_ylabel("Average reward")
    if results:
        # Skip the legend when nothing was plotted; matplotlib would otherwise
        # warn about missing labelled artists.
        ax.legend()

In [None]:
# Experiment: sample-average epsilon-greedy agents on the stationary bandit.
RUNS = 2000
STEPS = 2000
EPSILONS = [0.0, 0.01, 0.1]
SEED = 0

# The environment and the agents draw from NumPy's global RNG; seeding it here
# makes this cell produce the same curves on every Restart & Run All.
np.random.seed(SEED)

results = {}
env = Bandit()  # shared by all agents; run_experiment resets it at the start of every run
for eps in EPSILONS:
    agent = EpsilonGreedyAgent(epsilon=eps)
    results[eps] = run_experiment(agent, env, RUNS, STEPS)
plot_results(results)

In [None]:
class NonstationaryBandit(Bandit):
    """k-armed bandit whose true arm values drift after every pull.

    After each call to ``step`` every entry of ``q_star`` is perturbed by an
    independent zero-mean normal increment with standard deviation
    ``step_size`` (a random walk).
    """

    def __init__(self, k_arms=K_ARMS, step_size=0.01):
        """Create the environment; ``step_size`` is the random-walk standard deviation."""
        super().__init__(k_arms)
        self.step_size = step_size

    def step(self, action):
        """Return the reward for ``action``, then let all true values drift."""
        # The reward is sampled from the current values, before the drift.
        reward = super().step(action)
        drift = np.random.normal(loc=0, scale=self.step_size, size=self.k)
        self.q_star += drift
        return reward

In [None]:
class EpsilonGreedyAgentNonstationary(EpsilonGreedyAgent):
    """Epsilon-greedy agent that learns with a constant step size.

    Action selection is inherited unchanged from ``EpsilonGreedyAgent``. Only
    the value update differs: each estimate moves a fixed fraction
    ``step_size`` of the way towards the latest reward instead of using the
    sample average.
    """

    def __init__(self, epsilon=0.1, k_arms=K_ARMS, step_size=0.1):
        """Set up the parent agent and remember the constant step size."""
        super().__init__(epsilon, k_arms)
        self.step_size = step_size

    def update(self, action, reward):
        """Count the pull and apply ``Q <- Q + step_size * (R - Q)`` to ``action``."""
        # The count is still tracked, but it does not enter the update below.
        self.action_count[action] += 1
        error = reward - self.q_est[action]
        self.q_est[action] += self.step_size * error

In [None]:
# Experiment: constant-step-size epsilon-greedy agents on the nonstationary bandit.
RUNS = 2000
STEPS = 2000
EPSILONS = [0.0, 0.01, 0.1]
SEED = 0

# The environment and the agents draw from NumPy's global RNG; seeding it here
# makes this cell produce the same curves on every Restart & Run All.
np.random.seed(SEED)

results = {}
env = NonstationaryBandit()  # shared by all agents; run_experiment resets it at the start of every run
for eps in EPSILONS:
    agent = EpsilonGreedyAgentNonstationary(epsilon=eps)
    results[eps] = run_experiment(agent, env, RUNS, STEPS)

plot_results(results)

In [None]:
# Experiment: sample-average epsilon-greedy agents on the nonstationary bandit.
# NOTE(review): presumably the baseline for the constant-step-size agent; it
# uses 1000 runs rather than 2000 - confirm that is intentional.
RUNS = 1000
STEPS = 2000
EPSILONS = [0.0, 0.01, 0.1]
SEED = 0

# The environment and the agents draw from NumPy's global RNG; seeding it here
# makes this cell produce the same curves on every Restart & Run All.
np.random.seed(SEED)

results = {}
env = NonstationaryBandit()  # shared by all agents; run_experiment resets it at the start of every run
for eps in EPSILONS:
    agent = EpsilonGreedyAgent(epsilon=eps)
    results[eps] = run_experiment(agent, env, RUNS, STEPS)

plot_results(results)