From 0e8494f2afb1ef74f71f69dce7845862ef6fc3ab Mon Sep 17 00:00:00 2001
From: Juraj Szitas
Date: Sat, 7 Oct 2023 02:53:25 +0200
Subject: [PATCH] Add conjugated gradient descent solver, add math utilities,
 some cleanup

---
 example.cpp |  44 +++---
 nlsolver.h  | 422 +++++++++++++++++++++++++++++++++++++++++-----------
 2 files changed, 361 insertions(+), 105 deletions(-)

diff --git a/example.cpp b/example.cpp
index 7199874..d30111a 100644
--- a/example.cpp
+++ b/example.cpp
@@ -18,15 +18,19 @@
 #include "nlsolver.h"  // NOLINT
 // baseline 'tried and tested' solvers
-using nlsolver::DESolver;
-using nlsolver::GradientDescent;
+using nlsolver::DE;
+using nlsolver::NelderMead;
 using nlsolver::NelderMeadPSO;
-using nlsolver::NelderMeadSolver;
-using nlsolver::PSOSolver;
-using nlsolver::SANNSolver;
+using nlsolver::PSO;
+using nlsolver::SANN;
+using GDType = nlsolver::GradientStepType;
+using nlsolver::GradientDescent;
+
+using nlsolver::ConjugatedGradientDescent;
+
+// RNG
 using nlsolver::rng::xorshift;
 using nlsolver::rng::xoshiro;
-using GDType = nlsolver::GradientStepType;

 // experimental solvers
@@ -48,7 +52,7 @@ void print_vector(std::vector<double> &x) {

 // to use any C++ standard random number generator just pass in a generator
 // functor e.g. using Mersenne Twister
-#include <random>
+#include <random>  // NOLINT
 #include
 struct std_MT {
  private:
@@ -68,7 +72,7 @@ int main() {
   Rosenbrock prob;
   std::cout << "Nelder-Mead: " << std::endl;
   // initialize solver - passing the functor
-  auto nm_solver = NelderMeadSolver(prob);
+  auto nm_solver = NelderMead(prob);
   // initialize function arguments
   std::vector<double> nm_init = {2, 7};
   auto nm_res = nm_solver.minimize(nm_init);
@@ -84,7 +88,7 @@ int main() {
     const double t2 = (x[1] - x[0] * x[0]);
     return t1 * t1 + 100 * t2 * t2;
   };
-  auto nm_solver_lambda = NelderMeadSolver(RosenbrockLambda);
+  auto nm_solver_lambda = NelderMead(RosenbrockLambda);
   // initialize function arguments
   nm_init = {2, 7};
   // nm_init[0] = 2;
@@ -103,7 +107,7 @@ int main() {
   xorshift<double> gen;
   // again initialize solver, this time also with the RNG
   auto de_solver =
-      DESolver<Rosenbrock, xorshift<double>, double, DEStrat::best>(prob, gen);
+      DE<Rosenbrock, xorshift<double>, double, DEStrat::best>(prob, gen);

   std::vector<double> de_init = {2, 7};
@@ -116,7 +120,7 @@ int main() {
   std_MT std_gen;
   // again initialize solver, this time also with the RNG
   // if strategy is not specified, defaults to random
-  auto de_solver_MT = DESolver(prob, std_gen);
+  auto de_solver_MT = DE(prob, std_gen);
   // reset initial state
   de_init[0] = 2;
   de_init[1] = 7;
@@ -131,8 +135,7 @@ int main() {
   // add PSO Solver Type - defaults to random
   using nlsolver::PSOType;
   // again initialize solver, this time also with the RNG
-  auto pso_solver =
-      PSOSolver<Rosenbrock, xoshiro<double>, double>(prob, xos_gen);
+  auto pso_solver = PSO<Rosenbrock, xoshiro<double>, double>(prob, xos_gen);
   // set initial state - if no bounds are given, default initial parameters are
   // taken roughly as the scale of the parameter space
   std::vector<double> pso_init = {3, 3};
@@ -156,8 +159,8 @@ int main() {
   // we also have an accelerated version - we reset the RNG as well.
   xos_gen.reset();
   auto apso_solver =
-      PSOSolver<Rosenbrock, xoshiro<double>, double, PSOType::Accelerated>(
-          prob, xos_gen);
+      PSO<Rosenbrock, xoshiro<double>, double, PSOType::Accelerated>(prob,
+                                                                     xos_gen);
   // set initial state - if no bounds are given, default initial parameters are
   // taken roughly as the scale of the parameter space
   std::vector<double> apso_init = {3, 3};
@@ -168,8 +171,7 @@ int main() {
   std::cout << "Simulated Annealing with xoshiro: " << std::endl;
   // we also have an accelerated version - we reset the RNG as well.
   xos_gen.reset();
-  auto sann_solver =
-      SANNSolver<Rosenbrock, xoshiro<double>, double>(prob, xos_gen);
+  auto sann_solver = SANN<Rosenbrock, xoshiro<double>, double>(prob, xos_gen);
   // set initial state - if no bounds are given, default initial parameters are
   // taken roughly as the scale of the parameter space
   std::vector<double> sann_init = {3, 3};
@@ -217,5 +219,13 @@ int main() {
   gd_res_bigstep.print();
   print_vector(gd_init_bigstep);

+  std::cout << "Conjugated Gradient Descent (always requires linesearch)"
+            << std::endl;
+  auto cgd_solver = ConjugatedGradientDescent(prob);
+  std::vector<double> cgd_init = {2, 2};
+  auto cgd_res = cgd_solver.minimize(cgd_init);
+  cgd_res.print();
+  print_vector(cgd_init);
+
   return 0;
 }
diff --git a/nlsolver.h b/nlsolver.h
index 7d5b405..0fc73ca 100644
--- a/nlsolver.h
+++ b/nlsolver.h
@@ -38,6 +38,218 @@
 #include
 #include

+// mostly dot products and other fun stuff
+namespace nlsolver::math {
+template <typename T>
+inline T dot(const T *x, const T *y, int f) {
+  T s = 0;
+  for (int z = 0; z < f; z++) {
+    s += (*x) * (*y);
+    x++;
+    y++;
+  }
+  return s;
+}
+
+template <typename T>
+inline T fast_sum(const T *x, int size) {
+  T result = 0;
+  for (int i = 0; i < size; i++) {
+    result += (*x);
+    x++;
+  }
+  return result;
+}
+
+template <typename T>
+inline T vec_scalar_mult(const T *vec, const T *scalar, int f) {
+  T result = 0;
+  // accumulates sum(vec[i] * scalar), i.e. a scaled sum,
+  // not an elementwise product
+  for (int i = 0; i < f; i++) {
+    result += *vec * *scalar;
+    vec++;
+  }
+  return result;
+}
+
+// inspired by annoylib, see
+// https://github.com/spotify/annoy/blob/main/src/annoylib.h
+#if !defined(NO_MANUAL_VECTORIZATION) && defined(__GNUC__) && \
+    (__GNUC__ > 6) && defined(__AVX512F__)
+#define DOT_USE_AVX512
+#endif
+#if !defined(NO_MANUAL_VECTORIZATION) && defined(__AVX__) && \
+    defined(__SSE__) && defined(__SSE2__) && defined(__SSE3__)
+#define DOT_USE_AVX
+#endif
+
+#if defined(DOT_USE_AVX) || defined(DOT_USE_AVX512)
+#if defined(_MSC_VER)
+#include <intrin.h>
+#elif defined(__GNUC__)
+#include <x86intrin.h>
+#endif
+#endif
+
+#ifdef DOT_USE_AVX
+// Horizontal single sum of 256bit vector.
+inline float hsum256_ps_avx(__m256 v) {
+  const __m128 x128 =
+      _mm_add_ps(_mm256_extractf128_ps(v, 1), _mm256_castps256_ps128(v));
+  const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
+  const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
+  return _mm_cvtss_f32(x32);
+}
+
+inline double hsum256_pd_avx(__m256d v) {
+  __m128d vlow = _mm256_castpd256_pd128(v);
+  __m128d vhigh = _mm256_extractf128_pd(v, 1);  // high 128
+  vlow = _mm_add_pd(vlow, vhigh);               // reduce down to 128
+  __m128d high64 = _mm_unpackhi_pd(vlow, vlow);
+  return _mm_cvtsd_f64(_mm_add_sd(vlow, high64));  // reduce to scalar
+}
+
+// overload
+template <>
+inline float dot(const float *x, const float *y, int f) {
+  float result = 0;
+  if (f > 7) {
+    __m256 d = _mm256_setzero_ps();
+    for (; f > 7; f -= 8) {
+      d = _mm256_add_ps(d,
+                        _mm256_mul_ps(_mm256_loadu_ps(x), _mm256_loadu_ps(y)));
+      x += 8;
+      y += 8;
+    }
+    // Sum all floats in dot register.
+    result += hsum256_ps_avx(d);
+  }
+  // Don't forget the remaining values.
+  for (; f > 0; f--) {
+    result += *x * *y;
+    x++;
+    y++;
+  }
+  return result;
+}
+
+// second overload
+template <>
+inline double dot(const double *x, const double *y, int f) {
+  double result = 0;
+  if (f > 3) {
+    __m256d d = _mm256_setzero_pd();
+    for (; f > 3; f -= 4) {
+      d = _mm256_add_pd(d,
+                        _mm256_mul_pd(_mm256_loadu_pd(x), _mm256_loadu_pd(y)));
+      x += 4;
+      y += 4;
+    }
+    // Sum all doubles in dot register.
+    result += hsum256_pd_avx(d);
+  }
+  // Don't forget the remaining values.
+  for (; f > 0; f--) {
+    result += *x * *y;
+    x++;
+    y++;
+  }
+  return result;
+}
+
+template <>
+inline float fast_sum(const float *x, int size) {
+  float result = 0;
+  if (size > 7) {
+    __m256 d = _mm256_setzero_ps();
+    for (; size > 7; size -= 8) {
+      d = _mm256_add_ps(d, _mm256_loadu_ps(x));
+      x += 8;
+    }
+    // Sum all floats in dot register.
+    result += hsum256_ps_avx(d);
+  }
+  // Don't forget the remaining values.
+  for (; size > 0; size--) {
+    result += *x;
+    x++;
+  }
+  return result;
+}
+
+template <>
+inline double fast_sum(const double *x, int size) {
+  double result = 0;
+  if (size > 3) {
+    __m256d d = _mm256_setzero_pd();
+    for (; size > 3; size -= 4) {
+      d = _mm256_add_pd(d, _mm256_loadu_pd(x));
+      x += 4;
+    }
+    // Sum all doubles in dot register.
+    result += hsum256_pd_avx(d);
+  }
+  // Don't forget the remaining values.
+  for (; size > 0; size--) {
+    result += *x;
+    x++;
+  }
+  return result;
+}
+
+// sum of a vector multiplied by a scalar
+template <>
+inline float vec_scalar_mult(const float *vec, const float *scalar, int f) {
+  float result = 0;
+  // load single scalar
+  const __m256 s = _mm256_set1_ps(*scalar);
+  if (f > 7) {
+    __m256 d = _mm256_setzero_ps();
+    for (; f > 7; f -= 8) {
+      d = _mm256_add_ps(d, _mm256_mul_ps(_mm256_loadu_ps(vec), s));
+      vec += 8;
+    }
+    // Sum all floats in dot register.
+    result += hsum256_ps_avx(d);
+  }
+  // Don't forget the remaining values.
+  for (; f > 0; f--) {
+    result += *vec * *scalar;
+    vec++;
+  }
+  return result;
+}
+
+// overload for doubles
+template <>
+inline double vec_scalar_mult(const double *vec, const double *scalar,
+                              int f) {
+  double result = 0;
+  // load single scalar
+  const __m256d s = _mm256_set1_pd(*scalar);
+  if (f > 3) {
+    __m256d d = _mm256_setzero_pd();
+    for (; f > 3; f -= 4) {
+      d = _mm256_add_pd(d, _mm256_mul_pd(_mm256_loadu_pd(vec), s));
+      // advance by 4 doubles, one full 256-bit register
+      vec += 4;
+    }
+    // Sum all doubles in dot register.
+    result += hsum256_pd_avx(d);
+  }
+  // Don't forget the remaining values.
+  for (; f > 0; f--) {
+    result += *vec * *scalar;
+    vec++;
+  }
+  return result;
+}
+
+#endif
+
+}  // namespace nlsolver::math
+
 namespace nlsolver::finite_difference {
 // The 'accuracy' can be 0, 1, 2, 3.
 template
@@ -332,7 +544,7 @@ scalar_t cvsrch(Callable &f, std::vector<scalar_t> &x, scalar_t current_f_value,
   if (dginit >= 0.0) {
     // There is no descent direction.
     // TODO(patwie): Handle this case.
-    return -1;
+    return stpmin;
   }

   bool brackt = false;
@@ -435,8 +647,8 @@ scalar_t cvsrch(Callable &f, std::vector<scalar_t> &x, scalar_t current_f_value,
 template <typename Callable, typename scalar_t>
 [[nodiscard]] inline scalar_t line_at_alpha(
     Callable &f, std::vector<scalar_t> &linesearch_temp,
-    const std::vector<scalar_t> &x,
-    const std::vector<scalar_t> &search_direction, const scalar_t alpha) {
+    std::vector<scalar_t> &x, const std::vector<scalar_t> &search_direction,
+    const scalar_t alpha) {
   const size_t x_size = x.size();
   for (size_t i = 0; i < x_size; ++i) {
     linesearch_temp[i] = x[i] + alpha * search_direction[i];
@@ -445,8 +657,8 @@ template <typename Callable, typename scalar_t>
 }
 [[nodiscard]] [[maybe_unused]] scalar_t armijo_search(
-    Callable &f, const scalar_t current_f_val, const std::vector<scalar_t> &x,
-    const std::vector<scalar_t> &gradient,
+    Callable &f, const scalar_t current_f_val, std::vector<scalar_t> &x,
+    std::vector<scalar_t> &gradient,
     const std::vector<scalar_t> &search_direction, scalar_t alpha = 1.0) {
   constexpr scalar_t c = 0.2, rho = 0.9;
   scalar_t limit = 0.0;
@@ -469,8 +681,8 @@ template <typename Callable, typename scalar_t>
 // this is just an overload which allows us to pass a temporary
 template <typename Callable, typename scalar_t>
 [[maybe_unused]] scalar_t armijo_search(
-    Callable &f, const scalar_t current_f_val, const std::vector<scalar_t> &x,
-    const std::vector<scalar_t> &gradient,
+    Callable &f, const scalar_t current_f_val, std::vector<scalar_t> &x,
+    std::vector<scalar_t> &gradient,
     const std::vector<scalar_t> &search_direction,
     std::vector<scalar_t> &linesearch_temp, scalar_t alpha = 1.0) {
   constexpr scalar_t c = 0.2, rho = 0.9;
@@ -491,8 +703,7 @@ template <typename Callable, typename scalar_t>
 // this is just an overload which allows us to pass a temporary
 template <typename Callable, typename scalar_t>
 [[maybe_unused]] scalar_t armijo_search(
-    Callable &f, const std::vector<scalar_t> &x,
-    const std::vector<scalar_t> &gradient,
+    Callable &f, std::vector<scalar_t> &x, std::vector<scalar_t> &gradient,
     const std::vector<scalar_t> &search_direction,
     std::vector<scalar_t> &linesearch_temp, scalar_t alpha = 1.0) {
   constexpr scalar_t c = 0.2, rho = 0.9;
@@ -720,7 +931,7 @@ struct solver_status {
 };

 template <typename Callable, typename scalar_t = double>
-class NelderMeadSolver {
+class NelderMead {
  private:
   Callable &f;
   const scalar_t step, alpha, gamma, rho, sigma;

  public:
   // constructor
-  explicit NelderMeadSolver(
+  explicit NelderMead(
       Callable &f, const scalar_t step = -1, const scalar_t alpha = 1,
       const scalar_t gamma = 2, const scalar_t rho = 0.5,
       const scalar_t sigma = 0.5, const scalar_t eps = 10e-4,
@@ -996,7 +1207,7 @@ enum RecombinationStrategy { best, random };

 template <typename Callable, typename RNG, typename scalar_t = double,
           RecombinationStrategy strategy = random>
-class DESolver {
+class DE {
  private:
   Callable &f;
   RNG &generator;

  public:
   // constructor
-  DESolver(
+  DE(
       Callable &f, RNG &generator, const scalar_t crossover_prob = 0.9,
       const scalar_t differential_weight = 0.8, const scalar_t eps = 10e-4,
       const size_t pop_size = 50, const size_t max_iter = 1000,
@@ -1108,7 +1319,7 @@ enum PSOType { Vanilla, Accelerated };
 template <typename Callable, typename RNG, typename scalar_t = double,
           PSOType pso_type = PSOType::Vanilla>
-class PSOSolver {
+class PSO {
  private:
   // user supplied
   RNG &generator;
@@ -1130,7 +1341,7 @@ class PSOSolver {
   const scalar_t eps;

  public:
-  explicit PSOSolver(
+  explicit PSO(
       Callable &f, RNG &generator, const scalar_t inertia = 0.8,
       const scalar_t cognitive_coef = 1.8, const scalar_t social_coef = 1.8,
       const size_t n_particles = 10, const size_t max_iter = 5000,
@@ -1357,7 +1568,7 @@ class PSOSolver {
 };

 template <typename Callable, typename RNG, typename scalar_t = double>
-class SANNSolver {
+class SANN {
  private:
   // user supplied
   RNG &generator;
@@ -1369,10 +1580,10 @@ class SANNSolver {
   const scalar_t temperature_max;

  public:
-  SANNSolver(Callable &f, RNG &generator,
-             const size_t max_iter = 5000,
-             const size_t temperature_iter = 10,
-             const scalar_t temperature_max = 10.0)
+  SANN(Callable &f, RNG &generator,
+       const size_t max_iter = 5000,
+       const size_t temperature_iter = 10,
+       const scalar_t temperature_max = 10.0)
       : generator(generator),
         f(f),
         f_evals(0),
@@ -1928,11 +2139,12 @@ class GradientDescent {
   std::vector<scalar_t> search_direction, linesearch_temp;

  public:
-  explicit GradientDescent(
-      Callable &f, Grad g = fin_diff<Callable, scalar_t>(),
-      const size_t max_iter = 500, const scalar_t grad_eps = 1e-12,
-      const scalar_t alpha = 0.03)
+  explicit GradientDescent(Callable &f,
+                           Grad g = fin_diff<Callable, scalar_t>(),
+                           const size_t max_iter = 500,
+                           const scalar_t grad_eps = 1e-12,
+                           const scalar_t alpha = 0.03)
       : f(f), g(g), max_iter(max_iter), grad_eps(grad_eps), alpha(alpha) {}
   // minimize interface
   solver_status<scalar_t> minimize(std::vector<scalar_t> &x) {
@@ -1982,7 +2194,7 @@ class GradientDescent {
       for (size_t i = 0; i < n_dim; i++) {
         this->search_direction[i] = f_multiplier * gradient[i];
       }
-      alpha_ = nlsolver::linesearch::more_thuente_search(
+      alpha_ = nlsolver::linesearch::armijo_search(
           f_lam, x, gradient, this->search_direction, this->linesearch_temp,
           this->alpha);
     }
@@ -2016,6 +2228,101 @@ class GradientDescent {
     }
   }
 };
+
+template <typename Callable, typename scalar_t = double,
+          typename Grad = fin_diff<Callable, scalar_t>>
+class ConjugatedGradientDescent {
+  Callable &f;
+  Grad g;
+  const size_t max_iter;
+  const scalar_t grad_eps, alpha;
+  std::vector<scalar_t> search_direction, linesearch_temp;
+
+ public:
+  explicit ConjugatedGradientDescent(
+      Callable &f, Grad g = fin_diff<Callable, scalar_t>(),
+      const size_t max_iter = 500, const scalar_t grad_eps = 1e-12,
+      const scalar_t alpha = 0.03)
+      : f(f), g(g), max_iter(max_iter), grad_eps(grad_eps), alpha(alpha) {}
+  // minimize interface
+  solver_status<scalar_t> minimize(std::vector<scalar_t> &x) {
+    return this->solve<true>(x);
+  }
+  // maximize interface
+  solver_status<scalar_t> maximize(std::vector<scalar_t> &x) {
+    return this->solve<false>(x);
+  }
+
+ private:
+  template <const bool minimize = true>
+  solver_status<scalar_t> solve(std::vector<scalar_t> &x) {
+    const size_t n_dim = x.size();
+    std::vector<scalar_t> gradient = std::vector<scalar_t>(n_dim, 0.0);
+    // we need additional temporaries for linesearch
+    this->search_direction = std::vector<scalar_t>(n_dim, 0.0);
+    this->linesearch_temp = std::vector<scalar_t>(n_dim, 0.0);
+    scalar_t alpha_ = this->alpha;
+    size_t iter = 0, function_calls_used = 0;
+    constexpr scalar_t f_multiplier = minimize ? -1.0 : 1.0;
+    scalar_t max_grad_norm = 0;
+    // construct lambda that takes f and enables function evaluation counting
+    auto f_lam = [&](decltype(x) &coef) {
+      function_calls_used++;
+      return this->f(coef);
+    };
+    g(this->f, x, gradient);
+    if constexpr (std::is_same<Grad, fin_diff<Callable, scalar_t>>::value) {
+      // if we are using the preset, we know how many function evaluations are
+      // necessary
+      function_calls_used += 6 * n_dim;
+    }
+    // set search direction for linesearch
+    for (size_t i = 0; i < n_dim; i++) {
+      this->search_direction[i] = f_multiplier * gradient[i];
+    }
+    while (true) {
+      // check convergence via the gradient norm and the iteration budget
+      const scalar_t grad_norm = norm(gradient);
+      if (iter >= this->max_iter || grad_norm < grad_eps ||
+          std::isinf(grad_norm)) {
+        // evaluate at current parameters
+        scalar_t current_val = f_lam(x);
+        return solver_status<scalar_t>(current_val, iter, function_calls_used);
+      }
+      alpha_ = nlsolver::linesearch::armijo_search(
+          f_lam, x, gradient, this->search_direction, this->linesearch_temp,
+          this->alpha);
+      // update parameters
+      for (size_t i = 0; i < n_dim; i++) {
+        x[i] += alpha_ * this->search_direction[i];
+      }
+      // recompute gradient and build the new search direction by conjugation:
+      // first, compute gradient.dot(gradient) with the existing gradient,
+      // then compute the new gradient and the same dot product, and take
+      // their ratio (the Fletcher-Reeves beta coefficient)
+      scalar_t denominator = 0;
+      for (auto &val : gradient) denominator += std::pow(val, 2);
+      // recompute gradient
+      g(this->f, x, gradient);
+      if constexpr (std::is_same<Grad, fin_diff<Callable, scalar_t>>::value) {
+        // if we are using the preset, we know how many function evaluations
+        // are necessary
+        function_calls_used += 6 * n_dim;
+      }
+      // figure out the numerator from new gradient
+      scalar_t numerator = 0;
+      for (auto &val : gradient) numerator += std::pow(val, 2);
+      const scalar_t search_update = numerator / denominator;
+      // update search direction
+      for (size_t i = 0; i < n_dim; ++i) {
+        this->search_direction[i] *= search_update;
+        // effectively beta * search_direction - gradient
+        this->search_direction[i] += f_multiplier * gradient[i];
+      }
+      iter++;
+    }
+  }
+};
 };  // namespace nlsolver

 namespace nlsolver::experimental {};
@@ -2084,67 +2391,6 @@ class BFGS {
       iter++;
     }
   }
-  [[nodiscard]] bool check(const size_t iter, const scalar_t val,
-                           const size_t max_iter) {
-    // template
-    // void Update(const function::State previous_function_state,
-    //             const function::State current_function_state,
-    //             const State &stop_state) {
-    if (std::isnan(val)) {
-      std::cout << "Nan loss at iteration: " << iter << std::endl;
-      return false;
-    }
-    if (iter >= max_iter) {
-      std::cout << "Iteration limit reached." << std::endl;
-      return false;
-    }
-    // f_delta =
-    //     fabs(current_function_state.value - previous_function_state.value);
-    // x_delta = (current_function_state.x - previous_function_state.x)
-    //               .template lpNorm();
-    // gradient_norm =
-    //     current_function_state.gradient.template lpNorm();
-    // if ((stop_state.num_iterations > 0) &&
-    //     (num_iterations > stop_state.num_iterations)) {
-    //   status = Status::IterationLimit;
-    //   return;
-    // }
-    // if ((stop_state.x_delta > 0) && (x_delta < stop_state.x_delta)) {
-    //   x_delta_violations++;
-    //   if (x_delta_violations >= stop_state.x_delta_violations) {
-    //     status = Status::XDeltaViolation;
-    //     return;
-    //   }
-    // } else {
-    //   x_delta_violations = 0;
-    // }
-    // if ((stop_state.f_delta > 0) && (f_delta < stop_state.f_delta)) {
-    //   f_delta_violations++;
-    //   if (f_delta_violations >= stop_state.f_delta_violations) {
-    //     status = Status::FDeltaViolation;
-    //     return;
-    //   }
-    // } else {
-    //   f_delta_violations = 0;
-    // }
-    // if ((stop_state.gradient_norm > 0) &&
-    //     (gradient_norm < stop_state.gradient_norm)) {
-    //   status = Status::GradientNormViolation;
-    //   return;
-    // }
-    // if (previous_function_state.order == 2) {
-    //   if ((stop_state.condition_hessian > 0) &&
-    //       (condition_hessian > stop_state.condition_hessian)) {
-    //     status = Status::HessianConditionViolation;
-    //     return;
-    //   }
-    // }
-    // status = Status::Continue;
-    // }
-    return true;
-  }
   scalar_t step(std::vector<scalar_t> &x) {
     // update search direction vector using -inverse_hessian * gradient
     for (size_t j = 0; j < this->dim; j++) {
@@ -2168,7 +2414,7 @@ class BFGS {
         this->search_direction[i] = -this->gradient[i];
       }
     }
-    const scalar_t rate = nlsolver::linesearch::more_thuente_search(
+    const scalar_t rate = nlsolver::linesearch::armijo_search(
        this->f, this->current_val, x, this->gradient, this->search_direction,
         this->linesearch_temp);
     // const scalar_t rate = linesearch::MoreThuente::Search(
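
Note on the new solver: ConjugatedGradientDescent above implements the
Fletcher-Reeves update, beta_k = (g_{k+1}^T g_{k+1}) / (g_k^T g_k) and
d_{k+1} = beta_k * d_k - g_{k+1} (the sign is carried by f_multiplier when
maximizing), with the step length along d chosen by Armijo backtracking.

The sketch below is a minimal smoke test for the nlsolver::math kernels added
by this patch; it is not part of the patch and assumes the patched nlsolver.h
is on the include path. The vector length 11 is deliberately not a multiple of
the SIMD width (8 floats / 4 doubles per 256-bit register) so that the scalar
tail of each kernel is exercised; on non-AVX builds the generic templates are
used instead. The file name and tolerances are illustrative only.

    // smoke_test.cpp - cross-check the SIMD reductions against plain loops
    #include <cmath>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    #include "nlsolver.h"

    int main() {
      std::vector<double> a(11), b(11);
      std::iota(a.begin(), a.end(), 1.0);  // 1, 2, ..., 11
      std::iota(b.begin(), b.end(), 0.5);  // 0.5, 1.5, ..., 10.5
      const int n = static_cast<int>(a.size());

      const double d = nlsolver::math::dot(a.data(), b.data(), n);
      const double s = nlsolver::math::fast_sum(a.data(), n);
      const double scalar = 2.0;
      // note: vec_scalar_mult reduces to scalar * sum(vec); it does not
      // return an elementwise product
      const double m = nlsolver::math::vec_scalar_mult(a.data(), &scalar, n);

      // reference values from plain standard-library loops
      const double d_ref =
          std::inner_product(a.begin(), a.end(), b.begin(), 0.0);
      const double s_ref = std::accumulate(a.begin(), a.end(), 0.0);
      const double m_ref = scalar * s_ref;
      std::printf("dot %f vs %f, sum %f vs %f, scaled sum %f vs %f\n", d,
                  d_ref, s, s_ref, m, m_ref);
      // allow for reordering of floating-point additions in the SIMD paths
      const bool ok = std::abs(d - d_ref) < 1e-9 &&
                      std::abs(s - s_ref) < 1e-9 && std::abs(m - m_ref) < 1e-9;
      return ok ? 0 : 1;
    }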