New example PCA_stochastic to go with stochasticgradient
1 parent 7a17bd0 · commit 2409b8d
Showing 2 changed files with 129 additions and 43 deletions.
function [X, A] = PCA_stochastic(A, k)
% Example of a stochastic gradient algorithm in Manopt on a PCA problem.
%
% PCA (principal component analysis) on a dataset A of size nxd consists
% in solving
%
%     minimize_X f(X) = -.5*norm(A*X, 'fro')^2 / n,
%
% where X is a matrix of dimension dxk with orthonormal columns. This is
% equivalent to finding the k dominant right singular vectors of A, or
% the k top eigenvectors of A'*A.
%
% If n is large, this computation can be expensive. Thus, stochastic
% gradient algorithms take the point of view that f(X) is a sum of many
% (n) terms: each term involves only one of the n rows of A.
%
% To make progress, it may be sufficient to optimize with respect to a
% subset of the terms at each iteration. This way, each individual
% iteration can be very cheap. In particular, individual operations have
% a cost independent of n, because neither f nor its gradient ever needs
% to be evaluated completely (or at all, in the case of f).
%
% Stochastic gradient algorithms (this implementation in particular) are
% sensitive to proper parameter tuning. See the comments in the code.
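%
% For reference, here is a sketch of the sum structure exploited below,
% writing a_i' for the i-th row of A:
%
%     f(X) = -.5/n * sum_{i=1}^n norm(a_i'*X)^2,
%
% so that the Euclidean gradient of the i-th term is -a_i*(a_i'*X)/n,
% and a partial gradient over a sample only touches the sampled rows.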

% This file is part of Manopt and is copyrighted. See the license file.
%
% Main authors: Bamdev Mishra and Nicolas Boumal, Sept. 6, 2017
% Contributors:
%
% Change log:
%

    % If none is given, generate a random dataset: n samples in R^d.
    if ~exist('A', 'var') || isempty(A)
        d = 1000;
        n = 100000;
        fprintf('Generating data...');
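        % The diag factor below inflates three directions (by 15, 10
        % and 5), so the data has three clearly dominant principal
        % components: a natural fit for the default k = 3 picked below.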
        A = randn(n, d)*diag([[15 10 5], ones(1, d-3)]);
        fprintf(' done (size: %d x %d).\n', size(A));
    else
        [n, d] = size(A);
    end

    % Pick a number of components to compute.
    if ~exist('k', 'var') || isempty(k)
        k = 3;
    end

    % We are looking for k orthonormal vectors in R^d: Stiefel manifold.
    problem.M = stiefelfactory(d, k);

    % The cost function to minimize is a sum of n terms. This parameter
    % must be set for stochastic algorithms.
    problem.ncostterms = n;

    % We do not need to specify how to compute the value of the cost
    % function (stochastic algorithms never use it). All we need is to
    % specify how to compute the gradient of the cost function, where
    % the sum is restricted to a subset of the terms (a sample). Notice
    % that we specify a partial Euclidean gradient (hence the 'e' in
    % partialegrad). This way, Manopt automatically converts the
    % Euclidean vector into a proper Riemannian partial gradient, in the
    % tangent space at X. In particular, if sample = 1:n, then the
    % partial gradient corresponds to the actual (complete) gradient.
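    % (For stiefelfactory, that conversion amounts to an orthogonal
    % projection onto the tangent space at X: G - X*sym(X'*G), with
    % sym(M) = (M + M')/2.)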
    problem.partialegrad = @partialegrad;
    function G = partialegrad(X, sample)

        % X is an orthonormal matrix of size dxk;
        % sample is a vector of indices between 1 and n: a subset.
        % Extract the corresponding subset of the dataset.
        Asample = A(sample, :);

        % Compute the gradient of f restricted to that sample.
        G = -Asample'*(Asample*X);
        G = G / n;
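        % Note the division by n rather than by the sample size: each of
        % the n terms of f carries a 1/n factor, so the partial gradient
        % is simply the sum of the gradients of the sampled terms.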

    end

    % If one wants to use checkgradient to verify one's work, then it is
    % necessary to specify the cost function as well, as below.
    % problem.cost = @(X) -.5*norm(A*X, 'fro')^2 / n;
    % checkgradient(problem); pause;

    % To have the solver record statistics every x iterations, set
    % options.checkperiod to x. This records simple quantities which are
    % almost free to compute (namely, elapsed time and step size of the
    % last step). To record more sophisticated quantities, you can use
    % options.statsfun as usual. Time spent computing these statistics
    % is not counted in the times reported in the info structure
    % returned by the solver.
    options.checkperiod = 10;
    options.statsfun = statsfunhelper('metric', @(X) norm(A*X, 'fro'));

    % Set the parameters for the solver: stochastic gradient algorithms
    % tend to be quite sensitive to proper tuning, especially regarding
    % step size selection. See the solver's documentation for details.
    options.maxiter = 200;
    options.batchsize = 10;
    % options.stepsize_type = 'decay';
    options.stepsize_init = 1e2;
    options.stepsize_lambda = 1e-3;
    options.verbosity = 2;

    % Run the solver.
    [X, info] = stochasticgradient(problem, [], options);

    % Plot the special metric recorded by options.statsfun.
    plot([info.iter], [info.metric], '.-');
    xlabel('Iteration #');
    ylabel('Frobenius norm of A*X');
    title('Convergence of stochasticgradient on stiefelfactory for PCA');

    % Add a reference to that plot: the globally optimal value, attained
    % when the true dominant singular vectors are computed.
    fprintf('Running svds... ');
    t = tic();
    [V, ~] = svds(A', k);
    fprintf('done: %g [s] (note: svd may be faster)\n', toc(t));
    hold all;
    bound = norm(A*V, 'fro');
    plot([info.iter], bound*ones(size([info.iter])), '--');
    hold off;

    legend('Algorithm', 'SVD bound', 'Location', 'SouthEast');

end
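A minimal usage sketch (the matrix B below is just an illustration):
called with no arguments, the example generates its own synthetic
dataset; you can also pass your own data matrix and a number of
components.

    % Run on the built-in synthetic dataset (100000 samples in R^1000):
    [X, A] = PCA_stochastic();

    % Or run on your own data, asking for 2 dominant components:
    B = randn(5000, 50);
    X = PCA_stochastic(B, 2);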