In [3]:
import numpy as np
import matplotlib.pyplot as plt

In [29]:
class Testbed:
    """Epsilon-greedy agent evaluated on a 10-armed bandit testbed.

    Each run draws a fresh vector of true action values, centred on
    ``real_reward`` with unit-variance Gaussian spread, and lets an
    epsilon-greedy agent act for ``steps`` time steps. The agent keeps one
    value estimate per arm, updated either with a constant step size
    (``use_alpha=True``) or with the incremental sample mean.

    Parameters
    ----------
    real_reward : scalar
        Centre of the distribution the true action values are drawn from.
    time_mean : scalar
        Mean of the per-step increment applied by ``non_stationary``.
    time_variance : scalar
        Passed as ``scale`` to ``np.random.normal`` in ``non_stationary``,
        i.e. it acts as a standard deviation despite the name.
    steps : int
        Number of time steps per run.
    init_estimates : scalar
        Initial value of every action-value estimate.
    eps : float
        Probability of picking a uniformly random arm in ``select``.
    alpha : float
        Constant step size, used when ``use_alpha`` is true.
    use_alpha : bool
        If false, estimates are updated with the sample-mean rule instead.
    """

    def __init__(self, real_reward = 1, time_mean = 0, time_variance = 0.1, steps = int(1e4), init_estimates = 0,
                 eps = 0.1, alpha = 0.1, use_alpha = True):
        self.num_bandits = 10

        # Parameters of the environment.
        self.real_reward = real_reward
        self.time_mean = time_mean
        self.time_variance = time_variance

        # Parameters of the agent / experiment.
        self.init_estimates = init_estimates
        self.steps = steps
        self.eps = eps  # e-greedy epsilon
        self.alpha = alpha  # step size
        self.use_alpha = use_alpha  # if false, uses sample mean instead

        self.indices = np.arange(self.num_bandits)

        # All per-run state (time, counts, true values, estimates, best arm)
        # is created in one place so __init__ and reset() cannot drift apart.
        self.reset()

    def select(self):
        """Return the index of the arm to pull, chosen epsilon-greedily.

        With probability ``eps`` a uniformly random arm is returned;
        otherwise the arm with the highest current estimate (first index on
        ties, as given by ``np.argmax``).
        """
        toss = np.random.uniform()
        if toss < self.eps:
            return np.random.choice(self.indices)
        return np.argmax(self.q_estimates)

    def get_reward(self, arm_id : int):
        """Return the reward for pulling ``arm_id``.

        The reward is the arm's current true action value; no sampling
        noise is added here.
        """
        return self.true_action_val[arm_id]

    def update(self, arm_id : int, step_reward):
        """Advance the clock and fold ``step_reward`` into the estimate of ``arm_id``."""
        self.time += 1
        self.action_counts[arm_id] += 1
        error = step_reward - self.q_estimates[arm_id]
        if self.use_alpha:
            # Constant step size: exponential recency-weighted average.
            self.q_estimates[arm_id] += self.alpha * error
        else:
            # Incremental sample mean over the pulls of this arm.
            self.q_estimates[arm_id] += error / self.action_counts[arm_id]

    def reset(self):
        """Start a fresh run: new true action values, cleared counts and estimates."""
        self.time = 0
        # keep action counts of each arm
        self.action_counts = np.zeros(self.num_bandits)
        # True action values: real_reward plus standard-normal noise per arm.
        self.true_action_val = np.random.normal(size = self.num_bandits) + self.real_reward
        # estimated expected reward per arm
        self.q_estimates = np.full(self.num_bandits, self.init_estimates, dtype = np.float64)
        self.best_action = np.argmax(self.true_action_val)

    def non_stationary(self):
        """Apply one random-walk step to every true action value and refresh ``best_action``."""
        drift = np.random.normal(loc = self.time_mean, scale = self.time_variance, size = self.num_bandits)
        self.true_action_val = drift + self.true_action_val
        self.best_action = np.argmax(self.true_action_val)

    def start(self, runs = 100, verbose = True, drift = False):
        """Run the experiment and return per-step averages over all runs.

        Parameters
        ----------
        runs : int
            Number of independent runs; each begins with ``reset()``.
        verbose : bool
            If true (the default, matching the previous behaviour), print the
            run index at the start of each run.
        drift : bool
            If true, call ``non_stationary()`` after every step so the true
            action values follow a random walk. Off by default.

        Returns
        -------
        (avg_reward, frac_best) : tuple of 1-D arrays of length ``steps``
            Mean reward at each step, and the fraction of runs (0..1) in
            which the arm chosen at that step was the optimal arm.
        """
        rewards = np.zeros((runs, self.steps), dtype = np.float64)
        best_action_counts = np.zeros((runs, self.steps), dtype = np.float64)
        for rid in range(runs):
            if verbose:
                print(rid)
            self.reset()
            for tstep in range(self.steps):
                arm_id = self.select()
                step_reward = self.get_reward(arm_id)
                rewards[rid, tstep] = step_reward
                # Compare the *chosen arm* (not the run index) with the arm
                # that is optimal at the moment of the choice, i.e. before
                # any drift is applied for this step.
                if arm_id == self.best_action:
                    best_action_counts[rid, tstep] = 1
                self.update(arm_id, step_reward)
                if drift:
                    self.non_stationary()
        return rewards.mean(axis = 0), best_action_counts.mean(axis = 0)
    
    

In [30]:
if __name__ == "__main__":
    # Train two agents that differ only in their update rule: a constant
    # step size ("alpha") versus the incremental sample mean.
    t1 = Testbed(use_alpha = True)
    rewards1, best_action1 = t1.start()
    t2 = Testbed(use_alpha = False)
    rewards2, best_action2 = t2.start()

    # Two stacked panels: average reward on top, optimal-action rate below.
    fig, (ax_reward, ax_best) = plt.subplots(2, 1, figsize = (10, 20))

    ax_reward.plot(rewards1, label = "alpha")
    ax_reward.plot(rewards2, label = "sample mean")
    ax_reward.set_xlabel("steps")
    ax_reward.set_ylabel("avg. reward")
    ax_reward.legend()

    # start() returns a 0..1 fraction; scale by 100 so the plotted values
    # agree with the "% optimal action" axis label.
    ax_best.plot(100 * best_action1, label = "alpha")
    ax_best.plot(100 * best_action2, label = "sample mean")
    ax_best.set_xlabel("steps")
    ax_best.set_ylabel("% optimal action")
    ax_best.legend()

    fig.savefig("ex2_5.png")
    plt.close(fig)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
