# truncated_conjugate_gradient_descent.jl
@doc raw"""
    truncated_conjugate_gradient_descent(M, F, ∇F, x, η, H, Δ)

Solve the trust-region subproblem
```math
\operatorname*{arg\,min}_{\eta ∈ T_{x}M} m_{x}(\eta) = F(x) + \langle \nabla F(x), \eta \rangle_{x} + \frac{1}{2} \langle \operatorname{Hess}[F](\eta)_{x}, \eta \rangle_{x}
```
```math
\text{s.t.} \; \langle \eta, \eta \rangle_{x} \leq {\Delta}^2
```
with the Steihaug-Toint truncated conjugate-gradient method.
For a description of the algorithm and theorems offering convergence guarantees,
see the references:
* P.-A. Absil, C.G. Baker, K.A. Gallivan,
Trust-region methods on Riemannian manifolds, FoCM, 2007.
doi: [10.1007/s10208-005-0179-9](https://doi.org/10.1007/s10208-005-0179-9)
* A. R. Conn, N. I. M. Gould, P. L. Toint, Trust-region methods, SIAM,
MPS, 2000. doi: [10.1137/1.9780898719857](https://doi.org/10.1137/1.9780898719857)
# Input
* `M` – a manifold $\mathcal M$
* `F` – a cost function $F\colon\mathcal M\to\mathbb R$ to minimize
* `∇F` – the gradient $\nabla F\colon\mathcal M\to T\mathcal M$ of F
* `x` – a point on the manifold $x ∈ \mathcal M$
* `η` – an initial tangent vector $\eta ∈ \mathcal{T_{x}M}$ (only used as the start
  iterate if `useRandom` is `true`; otherwise the solver starts from the zero tangent vector)
* `H` – the Hessian $H(\mathcal M, x, \xi)$ of $F$
* `Δ` – a trust-region radius
# Optional
* `preconditioner` – a preconditioner for the Hessian `H`
* `θ` – `1+θ` is the superlinear convergence target rate. The algorithm will
  terminate early if the residual was reduced by a power of `1+θ`.
* `κ` – the linear convergence target rate: the algorithm will terminate
  early if the residual was reduced by a factor of `κ`
  (both residual criteria are spelled out right after this list).
* `useRandom` – set to `true` if the trust-region solve is to be initiated with a
  random tangent vector. If set to `true`, no preconditioner is used. This option
  is set to `true` in some scenarios to escape saddle points, but is otherwise
  seldom activated.
* `stopping_criterion` – ([`StopWhenAny`](@ref) of [`StopAfterIteration`](@ref),
  [`stopIfResidualIsReducedByFactor`](@ref), [`stopIfResidualIsReducedByPower`](@ref),
  [`StopWhenCurvatureIsNegative`](@ref), and [`StopWhenTrustRegionIsExceeded`](@ref))
  a functor inheriting from [`StoppingCriterion`](@ref) indicating when to stop.
  For the default, the maximal number of iterations is set to the dimension of the
  manifold, the power factor is `θ`, and the reduction factor is `κ`.
* `return_options` – (`false`) – if activated, the extended result, i.e. the
  complete [`Options`](@ref), is returned. This can be used to access recorded values.
  If set to `false` (default), only the approximate subproblem solution `η` is returned.
  All further keyword arguments are passed to [`decorate_options`](@ref) for decorators.
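With the default `stopping_criterion`, the iteration hence stops early (a sketch of the
intended behaviour, assuming the two residual-based criteria act as their names and the
descriptions above suggest) as soon as
```math
\Vert r_{k} \Vert_{x} \leq \kappa \Vert r_{0} \Vert_{x}
\quad\text{or}\quad
\Vert r_{k} \Vert_{x} \leq \Vert r_{0} \Vert_{x}^{1+\theta},
```
where $r_0 = \nabla F(x) + \operatorname{Hess}[F](\eta)_{x}$ is the initial residual of the model,
or as soon as negative curvature is detected or the trust-region radius is exceeded.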
# Output
* `η` – an approximate solution of the trust-region subproblem in
  $\mathcal{T_{x}M}$, or
* `options` – the options returned by the solver (see `return_options`).
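# Example
A minimal sketch of a direct call (normally [`trust_regions`](@ref) calls this solver
internally); the sphere manifold, the matrix `A`, and the cost, gradient, and Hessian
below are illustrative assumptions and not part of this file:
```julia
using Manopt, Manifolds, LinearAlgebra
M = Sphere(2)                         # example manifold (from Manifolds.jl)
A = Diagonal([2.0, 1.0, 0.5])         # example symmetric matrix
F(x) = x' * A * x                     # cost (not evaluated by this subproblem solver)
∇F(M, x) = project(M, x, 2 * A * x)   # Riemannian gradient: projected Euclidean gradient
H(M, x, ξ) = project(M, x, 2 * A * ξ) # (approximate) Hessian applied to a tangent vector ξ
x = normalize([1.0, 1.0, 1.0])        # a point on the sphere
η = zero_tangent_vector(M, x)         # start from the zero tangent vector
η_opt = truncated_conjugate_gradient_descent(M, F, ∇F, x, η, H, 0.5)
```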
# See also
[`trust_regions`](@ref)
"""
function truncated_conjugate_gradient_descent(
M::mT,
F::TF,
∇F::TdF,
x,
η,
H::Union{Function,Missing},
Δ::Float64;
preconditioner::Tprec = (M, x, ξ) -> ξ,
θ::Float64 = 1.0,
κ::Float64 = 0.1,
useRandom::Bool = false,
stopping_criterion::StoppingCriterion = StopWhenAny(
StopAfterIteration(manifold_dimension(M)),
stopIfResidualIsReducedByPower(
sqrt(inner(
M,
x,
∇F(M, x) + (useRandom ? H(M, x, η) : zero_tangent_vector(M, x)),
∇F(M, x) + (useRandom ? H(M, x, η) : zero_tangent_vector(M, x)),
)),
θ,
),
stopIfResidualIsReducedByFactor(
sqrt(inner(
M,
x,
∇F(M, x) + (useRandom ? H(M, x, η) : zero_tangent_vector(M, x)),
∇F(M, x) + (useRandom ? H(M, x, η) : zero_tangent_vector(M, x)),
)),
κ,
),
StopWhenTrustRegionIsExceeded(),
StopWhenCurvatureIsNegative(),
),
return_options = false,
kwargs..., #collect rest
) where {mT<:Manifold,TF,TdF,Tprec}
p = HessianProblem(M, F, ∇F, H, preconditioner)
o = TruncatedConjugateGradientOptions(
x,
stopping_criterion,
η,
zero_tangent_vector(M, x),
Δ,
zero_tangent_vector(M, x),
useRandom,
)
o = decorate_options(o; kwargs...)
resultO = solve(p, o)
if return_options
resultO
else
return get_solver_result(resultO)
end
end
function initialize_solver!(
p::P,
o::O,
) where {P<:HessianProblem,O<:TruncatedConjugateGradientOptions}
o.η = o.useRand ? o.η : zero_tangent_vector(p.M, o.x)
Hη = o.useRand ? getHessian(p, o.x, o.η) : zero_tangent_vector(p.M, o.x)
o.residual = get_gradient(p, o.x) + Hη
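# The initial model residual r₀ = ∇F(x) + Hess F(x)[η₀]; with the default (non-random)
# zero initial η this is simply the gradient.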
# Initial search direction: o.δ stores the negative of the CG search direction
# (this avoids a change of sign of the tangent vector).
return o.δ = o.useRand ? o.residual : get_preconditioner(p, o.x, o.residual)
# If the Hessian or a linear Hessian approximation is in use, it is
# theoretically guaranteed that the model value decreases strictly
# with each iteration of tCG. Hence, there is no need to monitor the model
# value. But, when a nonlinear Hessian approximation is used (such as the
# built-in finite-difference approximation for example), the model may
# increase. It is then important to terminate the tCG iterations and return
# the previous (the best-so-far) iterate. The variable below will hold the
# model value.
# o.model_value = o.useRand ? 0 : inner(p.M,o.x,o.η,get_gradient(p,o.x)) + 0.5 * inner(p.M,o.x,o.η,Hη)
end
function step_solver!(
p::P,
o::O,
iter,
) where {P<:HessianProblem,O<:TruncatedConjugateGradientOptions}
ηOld = o.η
δOld = o.δ
z = o.useRand ? o.residual : get_preconditioner(p, o.x, o.residual)
# this is not correct, it needs to be the inverse of the preconditioner!
zrOld = inner(p.M, o.x, z, o.residual)
HηOld = getHessian(p, o.x, ηOld)
# This call is the computationally expensive step.
Hδ = getHessian(p, o.x, δOld)
# Compute curvature (often called kappa).
δHδ = inner(p.M, o.x, δOld, Hδ)
# Note that if δHδ == 0, we will exit at the next "if" anyway.
α = zrOld / δHδ
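# α is the standard (preconditioned) CG step length; in exact arithmetic it minimizes
# the quadratic model along the current search direction.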
# <neweta,neweta>_P =
# <eta,eta>_P + 2*alpha*<eta,delta>_P + alpha*alpha*<delta,delta>_P
e_Pd = -inner(p.M, o.x, ηOld, o.useRand ? δOld : get_preconditioner(p, o.x, δOld)) # the minus sign is needed because o.δ stores the negative of the search direction, so the expansion below holds for the step η - α*δ
d_Pd = inner(p.M, o.x, δOld, o.useRand ? δOld : get_preconditioner(p, o.x, δOld))
e_Pe = inner(p.M, o.x, ηOld, o.useRand ? ηOld : get_preconditioner(p, o.x, ηOld))
e_Pe_new = e_Pe + 2α * e_Pd + α^2 * d_Pd # maybe the further options do need to be stored after all
# Check against negative curvature and trust-region radius violation.
# If either condition triggers, we bail out.
if δHδ <= 0 || e_Pe_new >= o.Δ^2
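# τ is the positive root of ‖η - τ*δ‖_P^2 = Δ^2, i.e. of e_Pe + 2τ*e_Pd + τ^2*d_Pd = Δ^2;
# this is the step length that takes the iterate exactly to the trust-region boundary.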
tau = (-e_Pd + sqrt(e_Pd^2 + d_Pd * (o.Δ^2 - e_Pe))) / d_Pd
o.η = ηOld - tau * (δOld)
else
# No negative curvature and the proposed η is inside the trust region: accept it.
o.η = ηOld - α * (δOld)
# Verify that the model cost decreased in going from eta to new_eta. If
# it did not (which can only occur if the Hessian approximation is
# nonlinear or because of numerical errors), then we return the
# previous eta (which necessarily is the best reached so far, according
# to the model cost). Otherwise, we accept the new eta and go on.
# -> Stopping Criterion
old_model_value =
inner(p.M, o.x, ηOld, get_gradient(p, o.x)) + 0.5 * inner(p.M, o.x, ηOld, HηOld)
new_model_value =
inner(p.M, o.x, o.η, get_gradient(p, o.x)) +
0.5 * inner(p.M, o.x, o.η, getHessian(p, o.x, o.η))
if new_model_value >= old_model_value
o.η = ηOld
end
end
# Update the residual.
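# In standard CG notation, r_{k+1} = r_k + α*H*d with search direction d = -δ,
# hence the minus sign below.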
o.residual = o.residual - α * Hδ
# Precondition the residual.
z = o.useRand ? o.residual : get_preconditioner(p, o.x, o.residual)
# this is not correct, it needs to be the inverse of the preconditioner!
# Compute the new z'*r (the old value, zrOld, was saved at the beginning of this step).
zr = inner(p.M, o.x, z, o.residual)
# Compute new search direction.
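# β is the standard (preconditioned) CG coefficient ⟨z_new, r_new⟩ / ⟨z_old, r_old⟩;
# the new (negated) direction is the preconditioned residual plus β times the previous
# direction, projected back onto the tangent space at x.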
β = zr / zrOld
return o.δ = project(p.M, o.x, z + β * o.δ)
end
get_solver_result(o::O) where {O<:TruncatedConjugateGradientOptions} = o.η