Q_sigma.py
from numpy import zeros, argmax, size, unique, inf, float64
from numpy.random import uniform, randint
class QSigma:
    def __init__(self, env, function_approximator, eps=0.1, gam=1, alph=0.1, sig=1, n=1, beta=1,
                 initial_theta=None):
        self.epsilon = eps  # exploration rate of the epsilon-greedy policy
        self.gamma = gam    # discount factor
        self.sigma = sig    # degree of sampling: 1 is fully Sarsa-like, 0 is fully tree-backup-like
        self.n = n          # number of steps of the n-step backup
        self.beta = beta    # per-episode decay factor applied to sigma
        # Environment
        self.env = env
        # Function approximator
        self.function_approximator = function_approximator
        self.alpha = alph * self.function_approximator.get_alpha_factor()
        # Probability of taking the greedy action under the epsilon-greedy policy
        self.optimal_p = (1 - self.epsilon) + (self.epsilon / self.env.get_num_actions())
        # Episode number: the number of training episodes the agent has completed so far
        self.episode_number = 0
        # Return per episode: the return the agent obtained in each training episode
        self.return_per_episode = []
        # An initial theta can be provided; otherwise the approximator's own initialization is used
        if initial_theta is not None:
            self.function_approximator.set_theta(initial_theta)
""" Setters and Getters for Epsilon, Sigma, n, and Alpha """
# Epsilon
def get_epsilon(self):
return self.epsilon
def set_epsilon(self, new_epsilon):
self.epsilon = new_epsilon
# Sigma
def get_sigma(self):
return self.sigma
def set_sigma(self, new_sigma):
self.sigma = new_sigma
# n
def get_n(self):
return self.n
def set_n(self, new_n):
self.n = new_n
# Alpha
def get_alpha(self):
return self.alpha
def set_alpha(self, new_alpha):
self.alpha = new_alpha
# Episode Number
def increase_episode_number(self):
self.episode_number += 1
# Add Return
def add_return(self, return_value):
self.return_per_episode.append(return_value)
    # Here q refers to the action-value function. This method returns the approximated
    # value of every action at a given state.
    def get_q(self, state):
        q = zeros(self.env.get_num_actions(), dtype=float64)
        for action in range(self.env.get_num_actions()):
            q[action] = self.function_approximator.get_value(state, action)
        return q
    # Returns an action drawn from an epsilon-greedy policy
    def epsilon_greedy_action(self, state):
        p = uniform()
        if p < self.epsilon:
            return randint(0, self.env.get_num_actions())
        else:
            return argmax(self.get_q(state))
    # Returns the probability of a given action in a given state under the epsilon-greedy policy.
    # When several actions tie for the maximum, the greedy mass (1 - epsilon) is split among them.
    def epsilon_greedy_probability(self, state, action):
        q = self.get_q(state)
        num_actions = self.env.get_num_actions()
        if size(unique(q)) < num_actions:
            # At least two actions share a value: split the greedy probability among the maxima
            max_q = max(q)
            max_observations = 0
            for value in q:
                if value == max_q:
                    max_observations += 1
            probabilities = zeros(size(q))
            for i in range(size(q)):
                if q[i] == max_q:
                    probabilities[i] = ((1 - self.epsilon) / max_observations) + (self.epsilon / num_actions)
                else:
                    probabilities[i] = self.epsilon / num_actions
            return probabilities[action]
        else:
            if action == argmax(q):
                return self.optimal_p
            return self.epsilon / num_actions
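    # Worked example (hypothetical numbers, for illustration only): with epsilon = 0.1
    # and 4 actions, the greedy action has probability 0.9 + 0.1/4 = 0.925 and every
    # other action has 0.1/4 = 0.025; if two actions tie for the maximum, each of
    # them instead gets 0.9/2 + 0.025 = 0.475, and the probabilities still sum to 1.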
    # Returns the expected action value at a given state under the epsilon-greedy policy.
    # Ties among maximal actions are harmless here: tied actions have equal values, so
    # how the greedy mass is split among them does not change the expectation.
    def average_q(self, state):
        q = self.get_q(state)
        average_q = float64(0)
        for action in range(self.env.get_num_actions()):
            p = self.optimal_p if action == argmax(q) else (self.epsilon / self.env.get_num_actions())
            average_q += p * q[action]
        return average_q
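    # Background on the update computed in train() below: the TD errors and weights
    # follow the on-policy n-step Q(sigma) backup (Sutton & Barto, 2nd ed.); since the
    # behavior policy equals the target policy, the importance-sampling ratio is 1.
    # Per step,
    #     delta_t = R_{t+1}
    #               + gamma * [ sigma * Q(S_{t+1}, A_{t+1})
    #                           + (1 - sigma) * sum_a pi(a | S_{t+1}) * Q(S_{t+1}, a) ]
    #               - Q(S_t, A_t)
    # and the n-step return used to update (S_tau, A_tau) is
    #     G = Q(S_tau, A_tau) + sum_{k=tau}^{min(tau+n, T)-1} E_k * delta_k,
    # where E_tau = 1 and E_{k+1} = E_k * gamma * ((1 - sigma) * pi(A_{k+1} | S_{k+1}) + sigma).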
    # Trains Q(sigma) for a given number of episodes. Training is cumulative:
    # train(1) followed by train(1) is equivalent to train(2).
    def train(self, num_episodes):
        if num_episodes == 0:
            return
        rewards_per_episode = zeros(num_episodes)
        # Circular buffers indexed modulo their length
        Actions = zeros(self.n + 1, dtype=int)
        States = [[] for _ in range(self.n + 1)]
        Q = zeros(self.n + 1)
        Delta = zeros(self.n)
        Pi = zeros(self.n)
        Sigma = zeros(self.n)
        for episode in range(num_episodes):
            self.increase_episode_number()
            S = self.env.init_state()
            A = self.epsilon_greedy_action(S)
            Reward_Sum = 0
            T = inf
            t = 0
            States[t % (self.n + 1)] = S
            Actions[t % (self.n + 1)] = A
            Q[t % (self.n + 1)] = self.function_approximator.get_value(S, A)
            while True:
                if t < T:
                    R, new_S = self.env.do_action(S, A)
                    States[(t + 1) % (self.n + 1)] = new_S
                    Reward_Sum += R
                    if new_S is None:
                        # Terminal transition: the value of the terminal state is zero
                        T = t + 1
                        Delta[t % self.n] = R - self.function_approximator.get_value(States[t % (self.n + 1)],
                                                                                     Actions[t % (self.n + 1)])
                    else:
                        Sigma[t % self.n] = self.sigma
                        new_A = self.epsilon_greedy_action(new_S)
                        Actions[(t + 1) % (self.n + 1)] = new_A
                        Q[(t + 1) % (self.n + 1)] = self.function_approximator.get_value(new_S, new_A)
                        # TD error mixing the sampled value (sigma) and the expected value (1 - sigma)
                        Delta[t % self.n] = R + (self.gamma * self.sigma * Q[(t + 1) % (self.n + 1)]) + \
                                            (self.gamma * (1 - self.sigma) * self.average_q(new_S)) - \
                                            Q[t % (self.n + 1)]
                        Pi[t % self.n] = self.epsilon_greedy_probability(new_S, new_A)
                        S = new_S
                        A = new_A
                Tau = t - self.n + 1  # the time step whose estimate is updated now
                if Tau >= 0:
                    E = 1
                    G = Q[Tau % (self.n + 1)]
                    for k in range(Tau, min(Tau + self.n, T)):
                        G += E * Delta[k % self.n]
                        E = self.gamma * E * ((1 - self.sigma) * Pi[k % self.n] + self.sigma)
                    Q_tau = self.function_approximator.get_value(States[Tau % (self.n + 1)],
                                                                 Actions[Tau % (self.n + 1)])
                    self.function_approximator.update(States[Tau % (self.n + 1)], Actions[Tau % (self.n + 1)],
                                                      self.alpha * (G - Q_tau))
                t += 1
                if Tau == T - 1:
                    break
            rewards_per_episode[episode] = Reward_Sum
            self.add_return(Reward_Sum)
            # Decay sigma once per episode (beta = 1 keeps it constant)
            self.set_sigma(self.sigma * self.beta)
        return rewards_per_episode
    # Runs the epsilon-greedy policy without learning and returns the average return per episode
    def test(self, num_episodes):
        if num_episodes == 0:
            return
        Reward_Sum = 0
        for episode in range(num_episodes):
            S = self.env.init_state()
            A = self.epsilon_greedy_action(S)
            while True:
                R, new_S = self.env.do_action(S, A)
                Reward_Sum += R
                if new_S is None:
                    break
                A = self.epsilon_greedy_action(new_S)
                S = new_S
        return Reward_Sum / num_episodes
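

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the original agent): the
# RandomWalkEnv and TabularApproximator below are hypothetical stand-ins that
# implement the interface QSigma assumes, namely init_state / do_action /
# get_num_actions on the environment and get_value / update / get_alpha_factor /
# set_theta on the function approximator.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    class RandomWalkEnv:
        """A 5-state random walk; reward +1 on reaching the right end."""
        NUM_STATES = 5

        def get_num_actions(self):
            return 2  # 0 = left, 1 = right

        def init_state(self):
            return self.NUM_STATES // 2  # start in the middle

        def do_action(self, state, action):
            next_state = state + (1 if action == 1 else -1)
            if next_state < 0:
                return 0, None  # terminate on the left, reward 0
            if next_state >= self.NUM_STATES:
                return 1, None  # terminate on the right, reward +1
            return 0, next_state

    class TabularApproximator:
        """One weight per (state, action) pair; the tabular special case."""
        def __init__(self, num_states, num_actions):
            self.theta = zeros((num_states, num_actions))

        def get_alpha_factor(self):
            return 1  # tabular case: no feature scaling of the step size

        def get_value(self, state, action):
            return self.theta[state, action]

        def update(self, state, action, delta):
            self.theta[state, action] += delta

        def set_theta(self, theta):
            self.theta = theta

    env = RandomWalkEnv()
    agent = QSigma(env, TabularApproximator(RandomWalkEnv.NUM_STATES, 2),
                   eps=0.1, gam=1, alph=0.1, sig=1, n=3, beta=0.95)
    agent.train(500)
    # test() evaluates the epsilon-greedy (not the purely greedy) policy
    print("average return over 100 test episodes:", agent.test(100))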