# accelerated_gradient_descent.jl
# http://stronglyconvex.com/blog/accelerated-gradient-descent.html
# TODO: Need to specify alphamax on each iteration
# Notation is flipped relative to Duckworth's post:
#   Start with x_{0}
#   y_{t} = x_{t - 1} - alpha * g(x_{t - 1})
#   If converged, return y_{t}
#   x_{t} = y_{t} + (t - 1) / (t + 2) * (y_{t} - y_{t - 1})
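
# A minimal standalone sketch of the update rules above on the quadratic
# f(x) = 0.5 * sum(abs2, x), whose gradient at x is simply x. The fixed step
# size, the iteration count, and the helper name `nesterov_demo` are
# illustrative assumptions; the optimizer below instead chooses alpha by line
# search and checks convergence on each iteration.
function nesterov_demo(x0; alpha = 0.1, iterations = 100)
    x = copy(x0)
    y_previous = copy(x0)
    for t in 1:iterations
        g = x                                           # gradient of 0.5 * sum(abs2, x)
        y = x - alpha * g                               # y_{t} = x_{t - 1} - alpha * g(x_{t - 1})
        x = y + (t - 1) / (t + 2) * (y - y_previous)    # Nesterov momentum correction
        y_previous = y
    end
    return x
end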
macro agdtrace()
    quote
        if tracing
            dt = Dict()
            if o.extended_trace
                dt["x"] = copy(x)
                dt["g(x)"] = copy(g)
            end
            g_norm = vecnorm(g, Inf)
            update!(tr,
                    iteration,
                    f_x,
                    g_norm,
                    dt,
                    o.store_trace,
                    o.show_trace,
                    o.show_every,
                    o.callback)
        end
    end
end
immutable AcceleratedGradientDescent <: Optimizer
    linesearch!::Function
end

AcceleratedGradientDescent(; linesearch!::Function = hz_linesearch!) =
    AcceleratedGradientDescent(linesearch!)
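
# A hypothetical usage sketch, assuming the surrounding Optim.jl API of this
# era: `DifferentiableFunction(f, g!)` wrapping an objective and an in-place
# gradient with the `(x, storage)` argument order used by `d.fg!` below, and an
# `OptimizationOptions` keyword constructor whose fields match the
# `o.g_tol` / `o.iterations` read in `optimize`. The Rosenbrock objective and
# the helper name `agd_example` are illustrative, not part of this package.
function agd_example()
    f(x) = (1.0 - x[1])^2 + 100.0 * (x[2] - x[1]^2)^2
    function g!(x, storage)
        storage[1] = -2.0 * (1.0 - x[1]) - 400.0 * x[1] * (x[2] - x[1]^2)
        storage[2] = 200.0 * (x[2] - x[1]^2)
        return
    end
    d = DifferentiableFunction(f, g!)
    opts = OptimizationOptions(g_tol = 1e-6, iterations = 10000)
    return optimize(d, [0.0, 0.0], AcceleratedGradientDescent(), opts)
end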
function optimize{T}(d::DifferentiableFunction,
                     initial_x::Vector{T},
                     mo::AcceleratedGradientDescent,
                     o::OptimizationOptions)
    # Print header if show_trace is set
    print_header(o)

    # Maintain current state in x and previous state in x_previous
    x, x_previous = copy(initial_x), copy(initial_x)

    # Maintain current intermediate state in y and previous intermediate state in y_previous
    y, y_previous = copy(initial_x), copy(initial_x)

    # Count the total number of iterations
    iteration = 0

    # Track calls to function and gradient
    f_calls, g_calls = 0, 0

    # Count number of parameters
    n = length(x)

    # Maintain current gradient in g
    g = Array(T, n)

    # The current search direction
    s = Array(T, n)

    # Buffers for use in line search
    x_ls, g_ls = Array(T, n), Array(T, n)

    # Store f(x) in f_x
    f_x_previous, f_x = NaN, d.fg!(x, g)
    f_calls, g_calls = f_calls + 1, g_calls + 1

    # Keep track of step-sizes
    alpha = alphainit(one(T), x, g, f_x)

    # TODO: How should this flag be set?
    mayterminate = false

    # Maintain a cache for line search results
    lsr = LineSearchResults(T)

    # Trace the history of states visited
    tr = OptimizationTrace(mo)
    tracing = o.store_trace || o.show_trace || o.extended_trace || o.callback != nothing
    @agdtrace

    # Assess types of convergence
    x_converged, f_converged, g_converged = false, false, false

    # Iterate until convergence
    converged = false
    while !converged && iteration < o.iterations
        # Increment the number of steps we've had to perform
        iteration += 1

        # Search direction is always the negative gradient
        @simd for i in 1:n
            @inbounds s[i] = -g[i]
        end

        # Refresh the line search cache
        dphi0 = vecdot(g, s)
        clear!(lsr)
        push!(lsr, zero(T), f_x, dphi0)

        # Determine the distance of movement along the search line
        alpha, f_update, g_update =
            mo.linesearch!(d, x, s, x_ls, g_ls, lsr, alpha, mayterminate)
        f_calls, g_calls = f_calls + f_update, g_calls + g_update
        # Take the gradient step from the current state x, matching the line search
        # above: y_{t} = x_{t - 1} - alpha * g(x_{t - 1})
        copy!(y_previous, y)
        @simd for i in 1:n
            @inbounds y[i] = x[i] + alpha * s[i]
        end
        # Record previous state
        copy!(x_previous, x)

        # Update current position with Nesterov correction
        scaling = (iteration - 1) / (iteration + 2)
        @simd for i in 1:n
            @inbounds x[i] = y[i] + scaling * (y[i] - y_previous[i])
        end

        # Update the function value and gradient
        f_x_previous, f_x = f_x, d.fg!(x, g)
        f_calls, g_calls = f_calls + 1, g_calls + 1

        x_converged,
        f_converged,
        g_converged,
        converged = assess_convergence(x,
                                       x_previous,
                                       f_x,
                                       f_x_previous,
                                       g,
                                       o.x_tol,
                                       o.f_tol,
                                       o.g_tol)

        @agdtrace
    end

    return MultivariateOptimizationResults("Accelerated Gradient Descent",
                                           initial_x,
                                           x,
                                           Float64(f_x),
                                           iteration,
                                           iteration == o.iterations,
                                           x_converged,
                                           o.x_tol,
                                           f_converged,
                                           o.f_tol,
                                           g_converged,
                                           o.g_tol,
                                           tr,
                                           f_calls,
                                           g_calls)
end