# In [None]:
import numpy as np
import matplotlib.pyplot as plt

class NonStationaryBandit:
    """Multi-armed bandit whose arm means drift as a Gaussian random walk.

    Every arm starts with a mean reward of zero. Each call to ``step``
    draws a reward for the chosen arm and then perturbs the mean of
    *every* arm, so the best arm can change over time.

    Attributes:
        n_arms: Number of arms.
        mean_rewards: Array of length ``n_arms`` holding the current mean
            reward of each arm.
    """

    def __init__(self, n_arms=10):
        """Create a bandit with ``n_arms`` arms, all with mean reward 0."""
        self.n_arms = n_arms
        self.mean_rewards = np.zeros(n_arms)

    def step(self, action):
        """Pull arm ``action`` and advance the random walk of all arm means.

        Args:
            action: Index of the arm to pull (``0 <= action < n_arms``).

        Returns:
            A reward sampled from a normal distribution centred on the
            chosen arm's current mean with standard deviation 1.
        """
        # Sample the reward before drifting so it reflects the means the
        # agent was acting on at this step.
        reward = np.random.normal(self.mean_rewards[action], 1)
        # Independent N(0, 0.01**2) increment for every arm.
        self.mean_rewards += np.random.normal(0, 0.01, self.n_arms)
        return reward

def modified_epsilon_greedy(bandit, n_steps=10000, epsilon=0.1, alpha=0.1):
    """Run epsilon-greedy with a constant step-size value update.

    With probability ``epsilon`` a uniformly random arm is pulled;
    otherwise the arm with the highest current estimate is pulled (ties
    resolve to the lowest index, as ``np.argmax`` does). Estimates are
    updated with a fixed step size ``alpha`` rather than a sample average,
    so recent rewards weigh more than old ones.

    Args:
        bandit: Object exposing an ``n_arms`` attribute and a
            ``step(action)`` method that returns a numeric reward.
        n_steps: Number of arm pulls to simulate.
        epsilon: Probability of taking an exploratory (random) action.
        alpha: Constant step size used in the value-estimate update.

    Returns:
        A tuple ``(rewards, actions)`` of float arrays of length
        ``n_steps``: the reward received and the arm index chosen at each
        step. ``actions`` holds indices as floats; cast with
        ``astype(int)`` before using them as indices.
    """
    q_values = np.zeros(bandit.n_arms)
    rewards = np.zeros(n_steps)
    actions = np.zeros(n_steps)

    for step in range(n_steps):
        if np.random.rand() < epsilon:
            # Explore: pick any arm uniformly at random.
            action = np.random.randint(bandit.n_arms)
        else:
            # Exploit: pick the arm with the best current estimate.
            action = np.argmax(q_values)

        reward = bandit.step(action)

        # Move the estimate a fixed fraction of the way toward the new
        # reward, which lets it track drifting arm means.
        q_values[action] += alpha * (reward - q_values[action])

        actions[step] = action
        rewards[step] = reward

    return rewards, actions

# Run one simulation of the constant step-size epsilon-greedy agent on the
# drifting bandit and visualise the outcome.
bandit = NonStationaryBandit()
n_steps = 10000
epsilon = 0.1
alpha = 0.1

rewards, actions = modified_epsilon_greedy(bandit, n_steps, epsilon, alpha)

# Running mean of the reward: cumulative sum divided by the number of steps
# taken so far (1-based).
plt.figure(figsize=(12, 6))
plt.plot(np.cumsum(rewards) / (np.arange(1, n_steps + 1)))
plt.title("Average Reward over Time in Modified Epsilon-Greedy Algorithm")
plt.xlabel("Time steps")
plt.ylabel("Average Reward")
plt.show()

# How often each arm was pulled. ``minlength`` guarantees one count per arm
# even when the highest-indexed arms were never selected; without it the
# counts array can be shorter than the x positions passed to ``plt.bar``.
action_counts = np.bincount(actions.astype(int), minlength=bandit.n_arms)
plt.bar(range(bandit.n_arms), action_counts)
plt.title("Action Selection Counts")
plt.xlabel("Action")
plt.ylabel("Counts")
plt.show()