diff --git a/.gitignore b/.gitignore
index 90debbd0e..e98d4238e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,5 @@
 benchmarks/graphs/*
 *~
 *.kate-swp
+docs/build/
+docs/site/
diff --git a/.travis.yml b/.travis.yml
index 3ebb0e69c..abaf285be 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -13,3 +13,5 @@ script:
 - julia -e 'Pkg.clone(pwd()); Pkg.test("Optim", coverage=true)'
 after_success:
 - julia -e 'cd(Pkg.dir("Optim")); Pkg.add("Coverage"); using Coverage; Codecov.submit(process_folder())'
+- julia -e 'Pkg.add("Documenter")'
+- julia -e 'cd(Pkg.dir("Optim")); include(joinpath("docs", "make.jl"))'
diff --git a/NEWS.md b/NEWS.md
index c13f4b637..3785f082e 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,5 @@
 # Optim v0.6.0 release notes
-* Added NEWS.md
+* Added documentation generated by [Documenter.jl](https://github.com/JuliaDocs/Documenter.jl), see PR [225](https://github.com/JuliaOpt/Optim.jl/pull/225).
+* Fixed bug in ConjugateGradient direction reset step, see issue [209](https://github.com/JuliaOpt/Optim.jl/issues/209).
+* Added NEWS.md
diff --git a/README.md b/README.md
index 50f7c37d9..b35c1b4d0 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,9 @@
+
+[![](https://img.shields.io/badge/docs-latest-blue.svg)](https://juliaopt.github.io/Optim.jl/latest)
+
 *This is the development branch of Optim.jl. Please visit [this branch](https://github.com/JuliaOpt/Optim.jl/tree/v0.4.5) to find the README.md belonging to the latest official release of Optim.jl*
+
 Optim.jl
 ========
diff --git a/docs/.documenter.enc b/docs/.documenter.enc
new file mode 100644
index 000000000..5dc57d2e3
Binary files /dev/null and b/docs/.documenter.enc differ
diff --git a/docs/make.jl b/docs/make.jl
new file mode 100644
index 000000000..6c110db26
--- /dev/null
+++ b/docs/make.jl
@@ -0,0 +1,12 @@
using Documenter, OptimDoc

# use include("Rosenbrock.jl") etc

# assuming linux.
#run(`mv ../LICENSE.md ./LICENSE.md`)
#run(`mv ../CONTRIBUTING.md ./dev/CONTRIBUTING.md`)
makedocs()

deploydocs(
    repo = "github.com/JuliaOpt/Optim.jl.git"
)
diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml
new file mode 100644
index 000000000..7214e47d0
--- /dev/null
+++ b/docs/mkdocs.yml
@@ -0,0 +1,51 @@
site_name: Optim.jl
repo_url: https://github.com/JuliaOpt/Optim.jl/
site_description: Pure Julia implementations of optimization algorithms.
site_author: JuliaOpt

theme: readthedocs

extra:
  palette:
    primary: 'indigo'
    accent: 'blue'

markdown_extensions:
  - codehilite
  - extra
  - tables
  - fenced_code

extra_javascript:
  - https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS_HTML
  - assets/mathjaxhelper.js

docs_dir: 'build'

pages:
  - Home: 'index.md'
  - General information:
      - Minimizing a function: 'user/minimization.md'
      - Configurable Options: 'user/config.md'
      - Tips and tricks: 'user/tipsandtricks.md'
      - Planned Changes: 'user/planned.md'
  - Algorithms:
      - Solvers:
          - Gradient Free:
              - Nelder Mead: 'algo/nelder_mead.md'
              - Simulated Annealing: 'algo/simulated_annealing.md'
#         - Univariate:
#             - Brent's Method: 'algo/brent.md'
#             - Golden Section: 'algo/goldensection.md'
          - Gradient Required:
#             - 'Conjugate Gradient': 'algo/conjugategradient.md'
              - 'Gradient Descent': 'algo/gradientdescent.md'
              - '(L-)BFGS': 'algo/lbfgs.md'
          - Hessian Required:
              - Newton: 'algo/newton.md'
      - Linesearch: 'algo/linesearch.md'
      - Preconditioners: 'algo/precondition.md'
  - 'Contributing':
      - 'Contributing': 'dev/contributing.md'
  - License: 'LICENSE.md'
diff --git a/docs/src/.Rhistory b/docs/src/.Rhistory
new file mode 100644
index 000000000..e69de29bb
diff --git a/docs/src/LICENSE.md b/docs/src/LICENSE.md
new file mode 100644
index 000000000..2daa4e88b
--- /dev/null
+++ b/docs/src/LICENSE.md
@@ -0,0 +1,9 @@
Optim.jl is licensed under the MIT License:

    Copyright (c) 2012: John Myles White and other contributors.

    Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

    The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/docs/src/algo/brent.md b/docs/src/algo/brent.md
new file mode 100644
index 000000000..28308a87f
--- /dev/null
+++ b/docs/src/algo/brent.md
@@ -0,0 +1,7 @@
# Brent's Method
## Constructor

## Description
## Example
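The sections above are empty placeholders so far. As a stop-gap illustration, a univariate sketch could look as follows (hedged: the univariate `optimize(f, lower, upper, method = ...)` signature is assumed here and is not documented elsewhere on these pages):

```julia
using Optim

# Hypothetical example (assumed signature): minimize a smooth scalar
# function on the bracketing interval [0, 2π] using Brent's method.
optimize(x -> sin(x), 0.0, 2π, method = Brent())
```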
## References
R. P. Brent (2002) Algorithms for Minimization Without Derivatives. Dover edition.
diff --git a/docs/src/algo/conjugategradient.md b/docs/src/algo/conjugategradient.md
new file mode 100644
index 000000000..5819ac07f
--- /dev/null
+++ b/docs/src/algo/conjugategradient.md
@@ -0,0 +1,14 @@
# Conjugate Gradient Descent
## Constructor
```julia
ConjugateGradient(; linesearch! = hz_linesearch!,
                    eta = 0.4,
                    P = nothing,
                    precondprep! = (P, x) -> nothing)
```

## Description

## Example
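The Example section is still empty; a minimal sketch (reusing the Rosenbrock objective and the positional-method call style from the Minimizing a function page):

```julia
using Optim
f(x) = (1.0 - x[1])^2 + 100.0 * (x[2] - x[1]^2)^2
# Without a user-supplied gradient, it is approximated for us.
optimize(f, [0.0, 0.0], ConjugateGradient())
```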
## References
W. W. Hager and H. Zhang (2006) Algorithm 851: CG_DESCENT, a conjugate gradient method with guaranteed descent. ACM Transactions on Mathematical Software 32: 113-137.
diff --git a/docs/src/algo/goldensection.md b/docs/src/algo/goldensection.md
new file mode 100644
index 000000000..b85adf533
--- /dev/null
+++ b/docs/src/algo/goldensection.md
@@ -0,0 +1,5 @@
# Golden Section
## Constructor
## Description
## Example
## References
diff --git a/docs/src/algo/gradientdescent.md b/docs/src/algo/gradientdescent.md
new file mode 100644
index 000000000..bdb624fcd
--- /dev/null
+++ b/docs/src/algo/gradientdescent.md
@@ -0,0 +1,29 @@
# Gradient Descent
## Constructor
```julia
GradientDescent(; linesearch!::Function = hz_linesearch!,
                  P = nothing,
                  precondprep! = (P, x) -> nothing)
```
## Description
Gradient Descent is a common name for a quasi-Newton solver. This means that it takes
steps according to

$ x_{n+1} = x_n - P^{-1}\nabla f(x_n)$

where $P$ is a positive definite matrix. If $P$ is the Hessian, we get Newton's method.
In Gradient Descent, $P$ is simply an appropriately dimensioned identity matrix,
such that we go in the exact opposite direction of the gradient. This means
that we do not use the curvature information from the Hessian, or an approximation
of it. While it does seem quite logical to go in the opposite direction of the fastest
increase in objective value, the procedure can be very slow if the problem is ill-conditioned.
See the section on preconditioners for ways to remedy this when using Gradient Descent.

As with the other quasi-Newton solvers in this package, a scalar $\alpha$ is introduced
as follows

$ x_{n+1} = x_n - \alpha P^{-1}\nabla f(x_n)$

and is chosen by a linesearch algorithm such that each step gives sufficient descent.
## Example
## References
diff --git a/docs/src/algo/index.md b/docs/src/algo/index.md
new file mode 100644
index 000000000..bcdc8fc4c
--- /dev/null
+++ b/docs/src/algo/index.md
@@ -0,0 +1 @@
# Solvers
diff --git a/docs/src/algo/lbfgs.md b/docs/src/algo/lbfgs.md
new file mode 100644
index 000000000..9fd80d8ca
--- /dev/null
+++ b/docs/src/algo/lbfgs.md
@@ -0,0 +1,43 @@
# (L-)BFGS
This page contains information about BFGS and its limited memory version L-BFGS.
## Constructors
```julia
BFGS(; linesearch! = hz_linesearch!,
       P = nothing,
       precondprep! = (P, x) -> nothing)
```

```julia
LBFGS(; m = 10,
        linesearch! = hz_linesearch!,
        P = nothing,
        precondprep! = (P, x) -> nothing)
```
## Description
Like the other quasi-Newton solvers in this package, (L-)BFGS takes steps according to

$ x_{n+1} = x_n - P^{-1}\nabla f(x_n)$

where $P$ is a positive definite matrix. If $P$ is the Hessian, we get Newton's method.
In (L-)BFGS, the matrix is an approximation to the Hessian built using differences
in the gradient across iterations. As long as the initial matrix is positive definite,
it is possible to show that all the following matrices will be as well. The starting
matrix could simply be the identity matrix, such that the first step is identical
to the Gradient Descent algorithm, or even the actual Hessian.

There are two versions of BFGS in the package: BFGS and L-BFGS. The latter differs
from the former in that it doesn't use a complete history of the iterative procedure to
construct $P$, but rather only the latest $m$ steps. It doesn't actually build the Hessian
approximation matrix either, but computes the direction directly. This makes it more suitable for
large-scale problems, as the memory requirement to store the relevant vectors will
grow quickly in large problems.

As with the other quasi-Newton solvers in this package, a scalar $\alpha$ is introduced
as follows

$ x_{n+1} = x_n - \alpha P^{-1}\nabla f(x_n)$

and is chosen by a linesearch algorithm such that each step gives sufficient descent.
## Example
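A minimal sketch for this section (the first call appears verbatim on the Minimizing a function page; the `m` keyword is the history length from the constructor above):

```julia
using Optim
f(x) = (1.0 - x[1])^2 + 100.0 * (x[2] - x[1]^2)^2
optimize(f, [0.0, 0.0], LBFGS())        # default history length m = 10
optimize(f, [0.0, 0.0], LBFGS(m = 5))   # shorter memory, cheaper per iteration
```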
## References
Wright, Stephen, and Jorge Nocedal (2006) "Numerical optimization." Springer
diff --git a/docs/src/algo/linesearch.md b/docs/src/algo/linesearch.md
new file mode 100644
index 000000000..a7b565a4c
--- /dev/null
+++ b/docs/src/algo/linesearch.md
@@ -0,0 +1,17 @@
# Line search
## Description

### Available line search algorithms

* `hz_linesearch!`, the default line search algorithm
* `backtracking_linesearch!`
* `interpolating_linesearch!`
* `mt_linesearch!`

The default line search algorithm is taken from the Conjugate Gradient implementation
by Hager and Zhang (HZ).

## Example
## References
W. W. Hager and H. Zhang (2006) "Algorithm 851: CG_DESCENT, a conjugate gradient method with guaranteed descent." ACM Transactions on Mathematical Software 32: 113-137.
Wright, Stephen, and Jorge Nocedal (2006) "Numerical optimization." Springer
diff --git a/docs/src/algo/nelder_mead.md b/docs/src/algo/nelder_mead.md
new file mode 100644
index 000000000..e2c980459
--- /dev/null
+++ b/docs/src/algo/nelder_mead.md
@@ -0,0 +1,38 @@
# Nelder-Mead
Nelder-Mead is currently the standard algorithm when no derivatives are provided.
## Constructor
```julia
NelderMead(; a = 1.0,
             g = 2.0,
             b = 0.5)
```
## Description
Our current implementation of the Nelder-Mead algorithm follows the original implementation
very closely, see Nelder and Mead (1965). This means that there is scope for improvement, but
also that it should be quite clear what is going on in the code relative to the original paper.

Instead of using gradient information, we keep track of the function value at a number
of points in the search space. Together, the points form a simplex. Given a simplex,
we can perform one of four actions: reflect, expand, contract, or shrink. Basically,
the goal is to iteratively replace the worst point with a better point. More information
can be found in Nelder and Mead (1965) or Gao and Han (2010).

The stopping rule is the same as in the original paper, and is basically the standard
error of the function values at the vertices. To set the tolerance level for this
convergence criterion, set the `g_tol` level as described in the Configurable Options
section.

When the solver finishes, we return a minimizer which is either the centroid or one of the vertices.
Returning the function value at the centroid adds a function evaluation, as we need to evaluate the objective
at the centroid to choose the smallest function value. However, even if the function value at the centroid can be returned
as the minimum, we do not trace it during the optimization iterations. This is to avoid
too many evaluations of the objective function, which can be computationally expensive.
Typically, there should be no more than twice as many `f_calls` as `iterations`,
and adding an evaluation at the centroid when `tracing` could considerably increase the total
run-time of the algorithm.

## Example
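A minimal sketch for this section (using the Rosenbrock objective from the Minimizing a function page; Nelder-Mead is also what `optimize` defaults to when no gradient is given):

```julia
using Optim
f(x) = (1.0 - x[1])^2 + 100.0 * (x[2] - x[1]^2)^2
optimize(f, [0.0, 0.0], NelderMead())
```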
## References
Nelder, John A. and R. Mead (1965). "A simplex method for function minimization". Computer Journal 7: 308–313. doi:10.1093/comjnl/7.4.308.
Gao, Fuchang and Lixing Han (2010). "Implementing the Nelder-Mead simplex algorithm with adaptive parameters". Computational Optimization and Applications [DOI 10.1007/s10589-010-9329-3]
diff --git a/docs/src/algo/newton.md b/docs/src/algo/newton.md
new file mode 100644
index 000000000..84fc64972
--- /dev/null
+++ b/docs/src/algo/newton.md
@@ -0,0 +1,59 @@
# Newton's Method
## Constructor
```julia
Newton(; linesearch! = hz_linesearch!)
```

The constructor takes one keyword:

* `linesearch! = a(d, x, p, x_new, g_new, lsr, c, mayterminate)`, a function performing line search, see the line search section.

## Description
Newton's method for optimization has a long history, and is in some sense the
gold standard in unconstrained optimization of smooth functions, at least from a theoretical viewpoint.
The main benefit is that it has a quadratic rate of convergence near a local optimum. The main
disadvantage is that the user has to provide a Hessian. This can be difficult, complicated, or simply annoying.
It can also be computationally expensive to calculate.

Newton's method for optimization consists of applying Newton's method for solving
systems of equations, where the equations are the first order conditions, saying
that the gradient should equal the zero vector.

$ \nabla f(x) = 0 $

A second order Taylor expansion of the left-hand side leads to the iterative scheme

$ x_{n+1} = x_n - H(x_n)^{-1}\nabla f(x_n)$

where the inverse is not calculated directly. Instead, the step $\textbf{s}$ is computed by solving

$ H(x_n) \textbf{s} = \nabla f(x_n). $

This is equivalent to minimizing a quadratic model, $m_k$, around the current $x_n$

$ m_k(\textbf{s}) = f(x_n) + \nabla f(x_n)^\top \textbf{s} + \frac{1}{2} \textbf{s}^\top H(x_n) \textbf{s} $

For functions where $H(x_n)$ is difficult, or computationally expensive, to obtain, we might
replace the Hessian with another positive definite matrix that approximates it.
Such methods are called Quasi-Newton methods; see (L-)BFGS and Gradient Descent.

In a sufficiently small neighborhood around the minimizer, Newton's method has
quadratic convergence, but globally it might have slower convergence, or it might
even diverge. To ensure convergence, a line search is performed for each $\textbf{s}$.
This amounts to replacing the step formula above with

$ x_{n+1} = x_n - \alpha \textbf{s}$

and finding a scalar $\alpha$ such that we get sufficient descent; see the line search section for more information.

Additionally, if the function is locally
concave, the step taken in the formulas above will go in a direction of ascent,
as the Hessian will not be positive (semi)definite.
To avoid this, we use a specialized method to calculate the step direction. If
the Hessian is positive semidefinite then the method used is standard, but if
it is not, a correction is made using the functionality in [PositiveFactorizations.jl](https://github.com/timholy/PositiveFactorizations.jl).

## Example
show the example from the issue
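Pending the example from the issue, a minimal placeholder sketch (assuming the Rosenbrock `f`, `g!`, and `h!` defined on the Minimizing a function page):

```julia
optimize(f, g!, h!, [0.0, 0.0], Newton())
```

Since `Newton()` is the default when a Hessian is supplied, `optimize(f, g!, h!, [0.0, 0.0])` is equivalent.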
## References
diff --git a/docs/src/algo/plap.png b/docs/src/algo/plap.png
new file mode 100644
index 000000000..cadfff132
Binary files /dev/null and b/docs/src/algo/plap.png differ
diff --git a/docs/src/algo/precondition.md b/docs/src/algo/precondition.md
new file mode 100644
index 000000000..eab534878
--- /dev/null
+++ b/docs/src/algo/precondition.md
@@ -0,0 +1,58 @@
# Preconditioning

The `GradientDescent`, `ConjugateGradient` and `LBFGS` methods support preconditioning. A preconditioner
can be thought of as a change of coordinates under which the Hessian is better conditioned. With a
good preconditioner, substantially improved convergence is possible.

A preconditioner `P` can be of any type as long as the following two methods are
implemented:

* `A_ldiv_B!(pgr, P, gr)` : apply `P` to a vector `gr` and store in `pgr`
  (intuitively, `pgr = P \ gr`)
* `dot(x, P, y)` : the inner product induced by `P`
  (intuitively, `dot(x, P * y)`)

Precisely what these operations mean depends on how `P` is stored. Commonly, we store a matrix `P` which
approximates the Hessian in some vague sense. In this case,

* `A_ldiv_B!(pgr, P, gr) = copy!(pgr, P \ gr)`
* `dot(x, P, y) = dot(x, P * y)`

Finally, it is possible to update the preconditioner as the state variable `x`
changes. This is done through `precondprep!`, which is passed to the
optimizers as a keyword argument, e.g.,
```jl
  method=ConjugateGradient(P = precond(100), precondprep! = (P, x) -> precond(100))
```
though in this case it would always return the same matrix.
(See `fminbox.jl` for a more natural example.)

Apart from preconditioning with matrices, `Optim.jl` provides
a type `InverseDiagonal`, which represents a diagonal matrix by
its inverse elements.

## Example
Below, we see an example where a function is minimized without and with a preconditioner
applied.
```jl
using ForwardDiff
plap(U; n = length(U)) = (n-1)*sum((0.1 + diff(U).^2).^2 ) - sum(U) / (n-1)
plap1 = ForwardDiff.gradient(plap)
precond(n) = spdiagm((-ones(n-1), 2*ones(n), -ones(n-1)), (-1,0,1), n, n)*(n+1)
df = DifferentiableFunction(x -> plap([0; x; 0]),
                            (x, g) -> copy!(g, (plap1([0; x; 0]))[2:end-1]))
result = Optim.optimize(df, zeros(100), method = ConjugateGradient(P = nothing))
result = Optim.optimize(df, zeros(100), method = ConjugateGradient(P = precond(100)))
```
The former optimize call converges at a slower rate than the latter. Looking at a
plot of the 2D version of the function shows the problem.

![plap](./plap.png)

The contours are shaped like ellipsoids, but we would rather want them to be circles.
Using the preconditioner effectively changes the coordinates such that the contours
become less ellipsoid-like. Benchmarking shows that using preconditioning provides
an approximate speed-up factor of 15 in this 100-dimensional case.


## References
diff --git a/docs/src/algo/simulated_annealing.md b/docs/src/algo/simulated_annealing.md
new file mode 100644
index 000000000..97d6a8084
--- /dev/null
+++ b/docs/src/algo/simulated_annealing.md
@@ -0,0 +1,58 @@
# Simulated Annealing
## Constructor
```julia
SimulatedAnnealing(; neighbor! = default_neighbor!,
                     T = default_temperature,
                     p = kirkpatrick)
```

The constructor takes three keywords:

* `neighbor! = a!(x_proposed, x_current)`, a mutating function of the current `x` and the proposed `x`
* `T = b(iteration)`, a function of the current iteration that returns a temperature
* `p = c(f_proposal, f_current, T)`, a function of the proposed and current function values and the current temperature, that returns an acceptance probability

## Description
Simulated Annealing is a derivative-free method for optimization. It is based on
the Metropolis-Hastings algorithm that was originally used to generate samples
from a thermodynamic system, and is often used to generate draws from a posterior
when doing Bayesian inference. As such, it is a probabilistic method for finding
the minimum of a function, often over quite large domains.
For these historical reasons, the algorithm uses terms such as cooling, temperature, and
acceptance probabilities.

As the constructor shows, a simulated annealing implementation is characterized
by a temperature, a neighbor function, and
an acceptance probability. The temperature controls how volatile the changes in
minimizer candidates are allowed to be, as it enters the acceptance probability.
For example, the original Kirkpatrick et al. acceptance probability function can be written
as follows
```julia
p(f_proposal, f_current, T) = exp(-(f_proposal - f_current)/T)
```
A high temperature makes it more likely that a draw is accepted, by pushing the acceptance
probability towards 1. As in the Metropolis-Hastings
algorithm, we always accept a smaller function value, but we also sometimes accept a
larger value. As the temperature decreases, we become more and more likely to only accept
candidate `x`'s that lower the function value. To obtain a new `f_proposal`, we need
a neighbor function. A simple neighbor function adds a standard normal draw to each
dimension of `x`
```julia
function neighbor!(x_proposal::Array, x::Array)
    for i in eachindex(x)
        x_proposal[i] = x[i] + randn()
    end
end
```
As we see, it is not really possible
to disentangle the roles of the different components of the algorithm. For example, the
functional form of the acceptance function, the temperature, and (indirectly) the neighbor
function all determine whether the next draw of `x` is accepted or not.

The current implementation of Simulated Annealing is very rough. It lacks quite
a few features which are normally part of a proper SA implementation.
A better implementation is under way, see [this issue](https://github.com/JuliaOpt/Optim.jl/issues/200).

## Example

## References
diff --git a/docs/src/dev/contributing.md b/docs/src/dev/contributing.md
new file mode 100644
index 000000000..33616698c
--- /dev/null
+++ b/docs/src/dev/contributing.md
@@ -0,0 +1,55 @@
## Notes for contributing

### Add a note in NEWS.md
If a change is more than just fixing a typo, it will often be required to add
a bullet point to the `NEWS.md` file at the root of the repository. This makes it
easier for users to keep track of the changes since the last version. A short description,
and a link to the PR or issue, is sufficient.

### Adding a solver
If you're contributing a new solver, you shouldn't need to touch any of the code in
`src/optimize.jl`. Instead, add a file named `solver.jl` in `src` (where `solver` is
the name of the solver), and make sure that you define a trace macro `solvertrace`, an `Optimizer` subtype
`immutable Solver <: Optimizer end` with appropriate fields, a default constructor with a keyword
for each field, and an `optimize` method. Say you want to contribute a solver called
`Minim`; then your `src/minim.jl` file would look something like

```julia
macro minimtrace()
    quote
        if tracing
            dt = Dict()
            if o.extended_trace
                dt["x"] = copy(x)
                dt["g(x)"] = copy(g)
                dt["~inv(H)"] = copy(invH)
            end
            g_norm = vecnorm(g, Inf)
            update!(tr,
                    iteration,
                    f_x,
                    g_norm,
                    dt,
                    o.store_trace,
                    o.show_trace,
                    o.show_every,
                    o.callback)
        end
    end
end

immutable Minim{T} <: Optimizer
    linesearch!::Function
    minim_parameter::T
end

Minim(; linesearch!::Function = hz_linesearch!, minim_parameter = 1.0) =
    Minim(linesearch!, minim_parameter)

function optimize{T}(d::DifferentiableFunction,
                     initial_x::Vector{T},
                     mo::Minim,
                     o::OptimizationOptions)
    # ... solver-specific optimization loop goes here ...
end
```
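With such a file in place, the new solver plugs into the generic interface like the built-in ones. A usage sketch (hedged: `Minim` is the made-up example solver from this page, and `f` and `g!` are assumed to be an objective and mutating gradient as on the Minimizing a function page):

```julia
optimize(f, g!, [0.0, 0.0], Minim(minim_parameter = 2.0))
```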
diff --git a/docs/src/dev/index.md b/docs/src/dev/index.md
new file mode 100644
index 000000000..98a84fbad
--- /dev/null
+++ b/docs/src/dev/index.md
@@ -0,0 +1 @@
### Using Nelder Mead
diff --git a/docs/src/index.md b/docs/src/index.md
new file mode 100644
index 000000000..de4a0748b
--- /dev/null
+++ b/docs/src/index.md
@@ -0,0 +1,43 @@
# Optim.jl
## How
Optim is registered in [METADATA.jl](https://github.com/JuliaLang/METADATA.jl).
This means that all you need to do to install Optim is to run
```julia
Pkg.add("Optim")
```
## What
Optim is a Julia package for optimizing functions of
various kinds. While there is some support for box constrained optimization, most
of the solvers try to find an $x$ that minimizes a function $f(x)$ without any constraints.
Thus, the main focus is on unconstrained optimization.

## Why
There are many solvers available from both free and commercial sources, and many
of them are accessible from Julia. Few of them are written in Julia.
Performance-wise this is rarely a problem, as they are often written in either
Fortran or C. However, solvers written directly in Julia
do come with some advantages.

When writing Julia software (packages) that require something to be optimized, the programmer
can either choose to write their own optimization routine, or use one of the many
available solvers. For example, this could be something from the [NLOpt](...) suite.
This means adding a dependency which is not written in Julia, and more assumptions
have to be made as to the environment the user is in. Does the user have the proper
compilers? Is it possible to use GPL'ed code in the project? Optim is released
under the MIT license, and installation is a simple `Pkg.add`, so it really doesn't
get much freer, easier, and more lightweight than that.

It is also true that using a solver written in C or Fortran makes it impossible to leverage one
of the main benefits of Julia: multiple dispatch. Since Optim is entirely written
in Julia, we can currently use the dispatch system to ease the use of custom preconditioners.
A planned feature along these lines is to allow for user-controlled choice of solvers
for various steps in the algorithm, entirely based on dispatch, and not on predefined
possibilities chosen by the developers of Optim.

Being a Julia package also means that Optim has access to the automatic differentiation
features through the packages in [JuliaDiff](http://www.juliadiff.org/).

## But...
Optim is a work in progress. There are still some rough edges to be sanded down,
and features we want to implement. There are also planned breaking changes that
are good to be aware of. Please see the section on Planned Changes.
diff --git a/docs/src/user/config.md b/docs/src/user/config.md
new file mode 100644
index 000000000..c9d38da9f
--- /dev/null
+++ b/docs/src/user/config.md
@@ -0,0 +1,71 @@
## Configurable options
There are several options that simply take on default values if the user
doesn't supply anything other than a function (and gradient) and a starting point.
### Solver options
There are quite a few different solvers available in Optim, and they are all listed
below. Notice that the constructors are written without input here, but they
generally take keywords to tweak the way they work. See the pages describing each
solver for more detail; a few representative calls are sketched after the lists below.

Requires only a function handle:

* `NelderMead()`
* `SimulatedAnnealing()`

Requires a function and gradient (will be approximated if omitted):

* `BFGS()`
* `LBFGS()`
* `ConjugateGradient()`
* `GradientDescent()`
* `MomentumGradientDescent()`
* `AcceleratedGradientDescent()`

Requires a function, a gradient, and a Hessian (cannot be omitted):

* `Newton()`

Box constrained minimization:

* `Fminbox()`

Special methods for univariate optimization:

* `Brent()`
* `GoldenSection()`
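As a quick sketch of how the requirements above translate into calls (assuming `f`, `g!`, and `h!` as defined on the Minimizing a function page):

```jl
optimize(f, [0.0, 0.0], NelderMead())       # objective only
optimize(f, g!, [0.0, 0.0], BFGS())         # objective and gradient
optimize(f, g!, h!, [0.0, 0.0], Newton())   # objective, gradient, and Hessian
```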
### General Options
In addition to the solver, you can alter the behavior of the Optim package by using the following keywords:

* `x_tol`: What is the threshold for determining convergence in the input vector? Defaults to `1e-32`.
* `f_tol`: What is the threshold for determining convergence in the objective value? Defaults to `1e-32`.
* `g_tol`: What is the threshold for determining convergence in the gradient? Defaults to `1e-8`. For gradient free methods, this will control the main convergence tolerance, which is solver specific.
* `iterations`: How many iterations will run before the algorithm gives up? Defaults to `1_000`.
* `store_trace`: Should a trace of the optimization algorithm's state be stored? Defaults to `false`.
* `show_trace`: Should a trace of the optimization algorithm's state be shown on `STDOUT`? Defaults to `false`.
* `extended_trace`: Save additional information. Solver dependent.
* `autodiff`: When only an objective function is provided, use automatic differentiation to compute exact numerical gradients. If not, finite differencing will be used. This functionality is experimental. Defaults to `false`.
* `show_every`: Trace output is printed every `show_every`th iteration.

We currently recommend the statically dispatched interface, using the `OptimizationOptions`
constructor:
```jl
res = optimize(f, g!,
               [0.0, 0.0],
               GradientDescent(),
               OptimizationOptions(g_tol = 1e-12,
                                   iterations = 10,
                                   store_trace = true,
                                   show_trace = false))
```
Another interface is also available, based directly on keywords:
```jl
res = optimize(f, g!,
               [0.0, 0.0],
               method = GradientDescent(),
               g_tol = 1e-12,
               iterations = 10,
               store_trace = true,
               show_trace = false)
```
Notice the need to specify the method using a keyword if this syntax is used. It is likely that this syntax will be deprecated in the future.
diff --git a/docs/src/user/minimization.md b/docs/src/user/minimization.md
new file mode 100644
index 000000000..9e1a0e0c7
--- /dev/null
+++ b/docs/src/user/minimization.md
@@ -0,0 +1,126 @@
## Minimizing a function
To show how the Optim package can be used, we implement the
[Rosenbrock function](http://en.wikipedia.org/wiki/Rosenbrock_function),
a classic problem in numerical optimization. We'll assume that you've already
installed the Optim package using Julia's package manager.
First, we load Optim and define the Rosenbrock function:
```jl
using Optim
f(x) = (1.0 - x[1])^2 + 100.0 * (x[2] - x[1]^2)^2
```
Once we've defined this function, we can find the minimum of the Rosenbrock
function using any of our favorite optimization algorithms. With a function defined,
we just specify an initial point `x` and run:
```jl
optimize(f, [0.0, 0.0])
```
Optim will default to using the Nelder-Mead method in this case, as we did not provide a gradient. This can also
be explicitly specified using:
```jl
optimize(f, [0.0, 0.0], NelderMead())
```
Other solvers are available. Below, we use L-BFGS, a quasi-Newton method that requires a gradient.
If we pass `f` alone, Optim will construct an approximate gradient for us
using central finite differencing:
```jl
optimize(f, [0.0, 0.0], LBFGS())
```
Alternatively, the `autodiff` keyword will use automatic differentiation to construct
the gradient:
```jl
optimize(f, [0.0, 0.0], LBFGS(), OptimizationOptions(autodiff = true))
```
For better performance and greater precision, you can pass your own gradient function. For the Rosenbrock example, the analytical gradient can be shown to be:
```jl
function g!(x::Vector, storage::Vector)
    storage[1] = -2.0 * (1.0 - x[1]) - 400.0 * (x[2] - x[1]^2) * x[1]
    storage[2] = 200.0 * (x[2] - x[1]^2)
end
```
Note that the functions we're using to calculate the gradient (and later the Hessian `h!`) of the Rosenbrock function mutate a fixed-sized storage array, which is passed as an additional argument called `storage`. By mutating a single array over many iterations, this style of function definition removes the sometimes considerable costs associated with allocating a new array during each call to the `g!` or `h!` functions. You can use `Optim` without manually defining a gradient or Hessian function, but if you do define these functions, they must take these two arguments in this order.
Returning to our optimization, you simply pass `g!` together with `f` from before to use the gradient:
```jl
optimize(f, g!, [0.0, 0.0], LBFGS())
```
For some methods, like simulated annealing, the gradient will be ignored:
```jl
optimize(f, g!, [0.0, 0.0], SimulatedAnnealing())
```
In addition to providing gradients, you can provide a Hessian function `h!` as well. In our current case this is:
```jl
function h!(x::Vector, storage::Matrix)
    storage[1, 1] = 2.0 - 400.0 * x[2] + 1200.0 * x[1]^2
    storage[1, 2] = -400.0 * x[1]
    storage[2, 1] = -400.0 * x[1]
    storage[2, 2] = 200.0
end
```
Now we can use Newton's method for optimization by running:
```jl
optimize(f, g!, h!, [0.0, 0.0])
```
This defaults to `Newton()`, since a Hessian was provided. Like gradients, the Hessian function will be ignored if you use a method that does not require it:
```jl
optimize(f, g!, h!, [0.0, 0.0], LBFGS())
```
Note that Optim will not generate approximate Hessians using finite differencing
because of the potentially low accuracy of approximations to the Hessians. Other
than Newton's method, none of the algorithms provided by the Optim package employ
exact Hessians.

## Obtaining results
After we have our results in `res`, we can use the API for getting optimization results.
This consists of a collection of functions. They are not exported, so they have to be prefixed by `Optim.`.
Say we do the following optimization (the linear term is needed for the minimizer and minimum shown below):
```jl
res = optimize(x -> dot(x, [1 0. 0; 0 3 0; 0 0 1]*x) + dot([1., 2., 3.], x), zeros(3))
```
If we can't remember what method we used, we simply use
```jl
Optim.method(res)
```
which will return `"Nelder Mead"`. A bit more useful information is the minimizer and minimum of the objective function, which can be found using
```jlcon
julia> Optim.minimizer(res)
3-element Array{Float64,1}:
 -0.499921
 -0.3333
 -1.49994

julia> Optim.minimum(res)
-2.8333333205768865
```

### Complete list of functions
A complete list of functions can be found below.

Defined for all methods:

* `method(res)`
* `minimizer(res)`
* `minimum(res)`
* `iterations(res)`
* `iteration_limit_reached(res)`
* `trace(res)`
* `x_trace(res)`
* `f_trace(res)`
* `f_calls(res)`
* `converged(res)`

Defined for univariate optimization:

* `lower_bound(res)`
* `upper_bound(res)`
* `x_lower_trace(res)`
* `x_upper_trace(res)`
* `rel_tol(res)`
* `abs_tol(res)`

Defined for multivariate optimization:

* `g_norm_trace(res)`
* `g_calls(res)`
* `x_converged(res)`
* `f_converged(res)`
* `g_converged(res)`
* `initial_state(res)`
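For instance, continuing from the `res` obtained above, a few of the accessors from the list can be used as follows:

```jl
Optim.minimizer(res)    # vector of parameter values at the minimum
Optim.minimum(res)      # objective value at the minimum
Optim.iterations(res)   # number of iterations used
Optim.converged(res)    # true if a convergence criterion was met
Optim.f_calls(res)      # number of objective evaluations
```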
diff --git a/docs/src/user/planned.md b/docs/src/user/planned.md
new file mode 100644
index 000000000..da7084792
--- /dev/null
+++ b/docs/src/user/planned.md
@@ -0,0 +1,22 @@
## Planned Changes
Some features and changes have been identified as "wanted". Some of these are breaking
changes, but for the better.

One important change is the ordering of the preallocated array and the current
iterate in the mutating gradient and Hessian functions. Currently, we have
```julia
g!(x, stor)
h!(x, stor)
```
But with the next version of Optim, we intend to be more in line with the rest
of the Julia ecosystem and write
```julia
g!(stor, x)
h!(stor, x)
```

It is also quite possible that the keyword-style tuning of options will be removed.
Instead of writing `optimize(..., g_tol = 1e-4)`, users will have to write `optimize(..., OptimizationOptions(g_tol = 1e-4))`.
Obviously, it is a bit more verbose, but it will allow the internals to work entirely through
dispatch, and avoid a lot of keyword handling. Simpler code is easier to maintain,
and less prone to bugs.
diff --git a/docs/src/user/tipsandtricks.md b/docs/src/user/tipsandtricks.md
new file mode 100644
index 000000000..473a8c405
--- /dev/null
+++ b/docs/src/user/tipsandtricks.md
@@ -0,0 +1,148 @@
## Dealing with constant parameters
In many applications, there may be factors that are relevant to the function evaluations,
but are fixed throughout the optimization. An obvious example is using data in a
likelihood function, but it could also be parameters we wish to hold constant.

Consider a squared error loss function that depends on some data `x` and `y`,
and parameters `betas`. As far as the solver is concerned, there should only be one
input argument to the function we want to minimize, call it `sqerror`.

The problem is that we want to optimize a function `sqerror` that really depends
on three inputs, and two of them are constant throughout the optimization procedure.
To do this, we need to define the variables `x` and `y`
```jl
x = [1.0, 2.0, 3.0]
y = 1.0 + 2.0 * x + [-0.3, 0.3, -0.1]
```
We then simply define a function in three variables
```julia
function sqerror(betas, X, Y)
    err = 0.0
    for i in 1:length(X)
        pred_i = betas[1] + betas[2] * X[i]
        err += (Y[i] - pred_i)^2
    end
    return err
end
```
and then optimize the following anonymous function
```jl
res = optimize(b -> sqerror(b, x, y), [0.0, 0.0])
```
Alternatively, we can define a closure `sqerror(betas)` that is aware of the variables we
just defined
```jl
function sqerror(betas)
    err = 0.0
    for i in 1:length(x)
        pred_i = betas[1] + betas[2] * x[i]
        err += (y[i] - pred_i)^2
    end
    return err
end
```
We can then optimize the `sqerror` function just like any other function
```jl
res = optimize(sqerror, [0.0, 0.0])
```

## Avoid repeating computations
Say you are optimizing a function
```julia
f(x) = x[1]^2 + x[2]^2
function g!(x, stor)
    stor[1] = 2x[1]
    stor[2] = 2x[2]
end
```
In this situation, no calculations from `f` could be reused in `g!`. However, sometimes
there is a substantial similarity between the objective function and gradient, and
some calculations can be reused.
The trick here is essentially the same as above. We use a closure or an anonymous function.
Basically, we define
```julia
function calculate_common!(x, last_x, buffer)
    if x != last_x
        copy!(last_x, x)
        # do whatever common calculations there are and save them to buffer
    end
end

function f(x, buffer, last_x)
    calculate_common!(x, last_x, buffer)
    f_body # depends on buffer
end

function g!(x, stor, buffer, last_x)
    calculate_common!(x, last_x, buffer)
    g_body! # depends on buffer
end
```
and then the following
```julia
using Optim
x0 = ...
buffer = Array(...) # Preallocate an appropriate buffer
last_x = similar(x0)
df = TwiceDifferentiableFunction(x -> f(x, buffer, last_x),
                                 (x, stor) -> g!(x, stor, buffer, last_x))
optimize(df, x0)
```
## Provide gradients
As mentioned in the general introduction, passing analytical gradients can have an
impact on performance. To show an example of this, consider the separable extension of the
Rosenbrock function in dimension 5000, see [SROSENBR](ftp://ftp.numerical.rl.ac.uk/pub/cutest/sif/SROSENBR.SIF) in CUTEst.

Below, we use the gradients and objective functions from [mastsif](http://www.cuter.rl.ac.uk/Problems/mastsif.shtml) through [CUTEst.jl](https://github.com/JuliaOptimizers/CUTEst.jl).
We only show the first five iterations of an attempt to minimize the function using
Gradient Descent.
```jlcon
julia> @time optimize(f, x0, GradientDescent(),
                      OptimizationOptions(show_trace=true, iterations = 5))
Iter     Function value   Gradient norm
     0     4.850000e+04     2.116000e+02
     1     1.018734e+03     2.704951e+01
     2     3.468449e+00     5.721261e-01
     3     2.966899e+00     2.638790e-02
     4     2.511859e+00     5.237768e-01
     5     2.107853e+00     1.020287e-01
 21.731129 seconds (1.61 M allocations: 63.434 MB, 0.03% gc time)
Results of Optimization Algorithm
 * Algorithm: Gradient Descent
 * Starting Point: [1.2,1.0, ...]
 * Minimizer: [1.0287767703731154,1.058769439356144, ...]
 * Minimum: 2.107853e+00
 * Iterations: 5
 * Convergence: false
   * |x - x'| < 1.0e-32: false
   * |f(x) - f(x')| / |f(x)| < 1.0e-32: false
   * |g(x)| < 1.0e-08: false
   * Reached Maximum Number of Iterations: true
 * Objective Function Calls: 23
 * Gradient Calls: 23

julia> @time optimize(f, g!, x0, GradientDescent(),
                      OptimizationOptions(show_trace=true, iterations = 5))
Iter     Function value   Gradient norm
     0     4.850000e+04     2.116000e+02
     1     1.018769e+03     2.704998e+01
     2     3.468488e+00     5.721481e-01
     3     2.966900e+00     2.638792e-02
     4     2.511828e+00     5.237919e-01
     5     2.107802e+00     1.020415e-01
  0.009889 seconds (915 allocations: 270.266 KB)
Results of Optimization Algorithm
 * Algorithm: Gradient Descent
 * Starting Point: [1.2,1.0, ...]
 * Minimizer: [1.0287763814102757,1.05876866832087, ...]
 * Minimum: 2.107802e+00
 * Iterations: 5
 * Convergence: false
   * |x - x'| < 1.0e-32: false
   * |f(x) - f(x')| / |f(x)| < 1.0e-32: false
   * |g(x)| < 1.0e-08: false
   * Reached Maximum Number of Iterations: true
 * Objective Function Calls: 23
 * Gradient Calls: 23
```
The objective has obtained a value that is very similar between the two runs, but
the run with the analytical gradient is far faster. It is possible that the finite
differences code can be improved, but generally the optimization will be slowed down
by all the function evaluations required to do the central finite differences calculations.
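If analytical gradients are not available, a middle ground is the experimental `autodiff` option described in the Configurable Options section. A sketch (hedged: this reuses `f` and `x0` from the runs above, and automatic differentiation only applies when the objective is written in pure, generic Julia):

```jl
optimize(f, x0, GradientDescent(),
         OptimizationOptions(autodiff = true, show_trace = true, iterations = 5))
```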