From 36e60962714260dd0d25da3d963d1b1ef8f1da4e Mon Sep 17 00:00:00 2001 From: Danylo Ulianych Date: Wed, 31 Jul 2019 16:55:47 +0200 Subject: [PATCH] Integrated GPFA (#233) * Rename neural_trajectory to gpfa and refactor the codes accordingly * Remove codes related to the (not fully implemented) cross-validation feature * added tqdm as an extra requirement * push coverage only for requirements-extras test * gpfa verbose flag; added licence in the docs * Fixed and extended documentation, removed misleading function outputs. --- .travis.yml | 4 +- MANIFEST.in | 3 +- elephant/gpfa.py | 442 +++++++++++++++++++++ elephant/gpfa_src/INFO.md | 7 + elephant/gpfa_src/__init__.py | 0 elephant/gpfa_src/gpfa_core.py | 526 +++++++++++++++++++++++++ elephant/gpfa_src/gpfa_util.py | 596 +++++++++++++++++++++++++++++ elephant/spike_train_generation.py | 2 +- elephant/test/test_gpfa.py | 177 +++++++++ requirements-extras.txt | 1 + 10 files changed, 1753 insertions(+), 5 deletions(-) create mode 100644 elephant/gpfa.py create mode 100644 elephant/gpfa_src/INFO.md create mode 100644 elephant/gpfa_src/__init__.py create mode 100644 elephant/gpfa_src/gpfa_core.py create mode 100644 elephant/gpfa_src/gpfa_util.py create mode 100644 elephant/test/test_gpfa.py diff --git a/.travis.yml b/.travis.yml index c532810c1..20c71d025 100644 --- a/.travis.yml +++ b/.travis.yml @@ -35,6 +35,7 @@ matrix: before_install: sudo apt install -y libopenmpi-dev openmpi-bin before_script: pip install -r requirements-extras.txt script: mpiexec -n 1 nosetests --with-coverage --cover-package=elephant + after_success: coveralls || echo "coveralls failed" - name: "conda 3.7" python: 3.7 @@ -80,6 +81,3 @@ install: script: nosetests --with-coverage --cover-package=elephant - -after_success: - coveralls || echo "coveralls failed" diff --git a/MANIFEST.in b/MANIFEST.in index 7df4bcd1a..fe492b888 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -5,7 +5,8 @@ include LICENSE.txt include elephant/VERSION include elephant/current_source_density_src/README.md include elephant/current_source_density_src/test_data.mat -include elephant/spade_src/LICENSE +include elephant/neural_trajectory_src/README.md +include elephant/spade_src/LICENCE recursive-include elephant/spade_src *.so *.pyd include elephant/test/spike_extraction_test_data.txt recursive-include doc * diff --git a/elephant/gpfa.py b/elephant/gpfa.py new file mode 100644 index 000000000..9edcc2d33 --- /dev/null +++ b/elephant/gpfa.py @@ -0,0 +1,442 @@ +""" +Gaussian-process factor analysis (GPFA) is a dimensionality reduction method + [1] for neural trajectory visualization of parallel spike trains. + +The input consists of a set of trials (Y), each containing a list of spike +trains (N neurons). The output is the projection (X) of the data in space +of pre-chosen dimension x_dim < N. + +Under the assumption of a linear relation between the latent +variable X and the actual data Y in addition to a noise term (i.e., +Y = C * X + d + Gauss(0,R)), the projection corresponds to the conditional +probability E[X|Y]. + +A Gaussian process (X) of dimension x_dim < N is adopted to extract smooth +neural trajectories. The parameters (C, d, R) are estimated from the data using +factor analysis technique. GPFA is simply a set of Factor Analyzers (FA), +linked together in the low dimensional space by a Gaussian Process (GP). 
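+
+A toy sketch of this generative model (shapes only; the arrays below are
+random placeholders, for illustration rather than actual use):
+
+>>> import numpy as np
+>>> x_dim, n_units, n_bins = 3, 50, 100
+>>> X = np.random.randn(x_dim, n_bins)        # latent trajectory
+>>> C = np.random.randn(n_units, x_dim)       # loading matrix
+>>> d = np.random.randn(n_units, 1)           # observation mean
+>>> R = np.diag(np.random.rand(n_units))      # observation noise covariance
+>>> noise = np.random.multivariate_normal(np.zeros(n_units), R, n_bins).T
+>>> Y = C.dot(X) + d + noise                  # Y = C * X + d + Gauss(0, R)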
+ +Internally, the analysis consists of the following steps: + +0) bin the data to get a sequence of N dimensional vectors for each time + bin (cf., `gpfa_util.get_seq`), and choose the reduced dimension x_dim + +1) expectation maximization for the parameters C, d, R and the time-scale of + the gaussian process, using all the trials provided as input (cf., + `gpfa_core.em`) + +2) projection of single trial in the low dimensional space (cf., + `gpfa_core.exact_inference_with_ll`) + +3) orthonormalization of the matrix C and the corresponding subspace: + (cf., `gpfa_util.orthogonalize`) + + +There are two principle scenarios of using the GPFA analysis. In the first +scenario, only one single dataset is available. The parameters that describe +the transformation are first extracted from the data, and the orthonormal basis +is constructed. Then the same data is projected into this basis. This analysis +is performed using the `gpfa()` function. + +In the second scenario, both a training and a test data set is available. Here, +the parameters are estimated from the training data. In a second step the test +data is projected into the non-orthonormal space obtained from the training +data, and then orthonormalized. From this scenario, it is possible to perform a +cross-validation between training and test data sets. This analysis is +performed by exectuing first `extract_trajectory()` on the training data, +followed by `postprocess()` on the training and test datasets. + + +References: + +The code was ported from the MATLAB code (see INFO.md for more details). + +[1] Yu MB, Cunningham JP, Santhanam G, Ryu SI, Shenoy K V, Sahani M (2009) +Gaussian-process factor analysis for low-dimensional single-trial analysis of +neural population activity. J Neurophysiol 102:614-635. + +:copyright: Copyright 2015-2019 by the Elephant team, see AUTHORS.txt. +:license: Modified BSD, see LICENSE.txt for details. +""" + + +import numpy as np +import neo +import quantities as pq + +from elephant.gpfa_src import gpfa_core, gpfa_util + + +def extract_trajectory(seqs, bin_size=20., x_dim=3, min_var_frac=0.01, + em_max_iters=500, verbose=False): + """ + Prepares data, estimates parameters of the latent variables and extracts + neural trajectories. + + Parameters + ---------- + seqs : np.recarray + data structure, whose nth entry (corresponding to the nth experimental + trial) has fields + * trialId: unique trial identifier + * T: (1 x 1) number of timesteps + * y: (yDim x T) neural data + bin_size : float, optional + Width of each time bin in ms (magnitude). + Default is 20 ms. + x_dim : int, optional + State dimensionality. + Default is 3. + min_var_frac : float, optional + fraction of overall data variance for each observed + dimension to set as the private variance floor. This is + used to combat Heywood cases, where ML parameter learning + returns one or more zero private variances. (default: 0.01) + (See Martin & McDonald, Psychometrika, Dec 1975.) + em_max_iters : int, optional + Number of EM iterations to run (default: 500). + verbose : bool, optional + specifies whether to display status messages (default: False) + + Returns + ------- + + parameter_estimates: dict + Estimated model parameters. + When the GPFA method is used, following parameters are contained + covType: {'rbf', 'tri', 'logexp'} + type of GP covariance. + Currently, only 'rbf' is supported. 
+ gamma: ndarray of shape (1, #latent_vars) + related to GP timescales by 'bin_width / sqrt(gamma)' + eps: ndarray of shape (1, #latent_vars) + GP noise variances + d: ndarray of shape (#units, 1) + observation mean + C: ndarray of shape (#units, #latent_vars) + mapping between the neuronal data space and the latent variable + space + R: ndarray of shape (#units, #latent_vars) + observation noise covariance + + seqs_train: np.recarray + Contains the embedding of the training data into the latent variable + space. + Data structure, whose n-th entry (corresponding to the n-th + experimental trial) has fields + * trialId: int + unique trial identifier + * T: int + number of timesteps + * y: ndarray of shape (#units, #bins) + neural data + * xsm: ndarray of shape (#latent_vars, #bins) + posterior mean of latent variables at each time bin + * Vsm: ndarray of shape (#latent_vars, #latent_vars, #bins) + posterior covariance between latent variables at each + timepoint + * VsmGP: ndarray of shape (#bins, #bins, #latent_vars) + posterior covariance over time for each latent variable + + fit_info: dict + Information of the fitting process and the parameters used there: + * iteration_time: A list containing the runtime for each iteration + step in the EM algorithm. + * log_likelihood: float, maximized likelihood obtained in the + E-step of the EM algorithm. + * bin_size: int, Width of the bins. + * cvf: int, number for cross-validation folding + Default is 0 (no cross-validation). + * has_spikes_bool: Indicates if a neuron has any spikes across + trials. + * method: str, Method name. + + Raises + ------ + ValueError + If `seqs` is empty. + + """ + if len(seqs) == 0: + raise ValueError("Got empty trials.") + + # Set cross-validation folds + num_trials = len(seqs) + + test_mask = np.full(num_trials, False, dtype=bool) + train_mask = ~test_mask + + tr = np.arange(num_trials, dtype=np.int) + train_trial_idx = tr[train_mask] + test_trial_idx = tr[test_mask] + seqs_train = seqs[train_trial_idx] + seqs_test = seqs[test_trial_idx] + + # Remove inactive units based on training set + has_spikes_bool = (np.hstack(seqs_train['y']).mean(1) != 0) + + for seq_train in seqs_train: + seq_train['y'] = seq_train['y'][has_spikes_bool, :] + for seq_test in seqs_test: + seq_test['y'] = seq_test['y'][has_spikes_bool, :] + + # Check if training data covariance is full rank + y_all = np.hstack(seqs_train['y']) + y_dim = y_all.shape[0] + + if np.linalg.matrix_rank(np.cov(y_all)) < y_dim: + errmesg = 'Observation covariance matrix is rank deficient.\n' \ + 'Possible causes: ' \ + 'repeated units, not enough observations.' + raise ValueError(errmesg) + + if verbose: + print('Number of training trials: {}'.format(len(seqs_train))) + print('Number of test trials: {}'.format(len(seqs_test))) + print('Latent space dimensionality: {}'.format(x_dim)) + print('Observation dimensionality: {}'.format(has_spikes_bool.sum())) + + # The following does the heavy lifting. + params_est, seqs_train, fit_info = gpfa_core.gpfa_engine( + seq_train=seqs_train, + seq_test=seqs_test, + x_dim=x_dim, + bin_width=bin_size, + min_var_frac=min_var_frac, + em_max_iters=em_max_iters, + verbose=verbose) + + fit_info['has_spikes_bool'] = has_spikes_bool + fit_info['min_var_frac'] = min_var_frac + fit_info['bin_size'] = bin_size + + return params_est, seqs_train, fit_info + + +def postprocess(params_est, seqs_train, seqs_test=None): + """ + Orthonormalization and other cleanup. 
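+
+    In essence (a simplified sketch of what `gpfa_util.orthogonalize` does),
+    the loading matrix C is factorized by singular value decomposition and the
+    latent trajectories are transformed accordingly, so that the returned
+    loading matrix has orthonormal columns:
+
+    >>> import numpy as np
+    >>> C = np.random.randn(50, 3)            # (#units, #latent_vars)
+    >>> X = np.random.randn(3, 100)           # (#latent_vars, #bins)
+    >>> U, S, Vt = np.linalg.svd(C, full_matrices=False)
+    >>> Corth = U                             # orthonormalized loading matrix
+    >>> Xorth = np.diag(S).dot(Vt).dot(X)     # correspondingly transformed latents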
+ + Parameters + ---------- + + params_est : dict + First return value of extract_trajectory() on the training data set. + Estimated model parameters. + When the GPFA method is used, following parameters are contained + covType: {'rbf', 'tri', 'logexp'} + type of GP covariance + Currently, only 'rbf' is supported. + gamma: ndarray of shape (1, #latent_vars) + related to GP timescales by 'bin_width / sqrt(gamma)' + eps: ndarray of shape (1, #latent_vars) + GP noise variances + d: ndarray of shape (#units, 1) + observation mean + C: ndarray of shape (#units, #latent_vars) + mapping between the neuronal data space and the latent variable + space + R: ndarray of shape (#units, #latent_vars) + observation noise covariance + + seqs_train: np.recarray + Contains the embedding of the training data into the latent variable + space. + Data structure, whose n-th entry (corresponding to the n-th + experimental trial) has fields + * trialId: int + unique trial identifier + * T: int + number of timesteps + * y: ndarray of shape (#units, #bins) + neural data + * xsm: ndarray of shape (#latent_vars, #bins) + posterior mean of latent variables at each time bin + * Vsm: ndarray of shape (#latent_vars, #latent_vars, #bins) + posterior covariance between latent variables at each + timepoint + * VsmGP: ndarray of shape (#bins, #bins, #latent_vars) + posterior covariance over time for each latent variable + + seqs_test: np.recarray, optional + Contains the embedding of the test data into the latent variable space. + Same data structure as for `seqs_train`. + Default is None. + + Returns + ------- + + params_est : dict + Estimated model parameters, including `Corth`, obtained by + orthonormalizing the columns of C. + seqs_train : np.recarray + Training data structure that contains the new field `xorth`, + the orthonormalized neural trajectories. + seqs_test : np.recarray + Test data structure that contains orthonormalized neural + trajectories in `xorth`, obtained using `params_est`. + When no test dataset is given, None is returned. + + + Raises + ------ + ValueError + If `fit_info['kernSDList'] != kern_sd`. + + """ + C = params_est['C'] + X = np.hstack(seqs_train['xsm']) + Xorth, Corth, _ = gpfa_util.orthogonalize(X, C) + seqs_train = gpfa_util.segment_by_trial(seqs_train, Xorth, 'xorth') + + params_est['Corth'] = Corth + + if seqs_test is not None: + print("Extracting neural trajectories for test data...\n") + + # Copy additional fields from extract_trajectory from training data to + # test data + seqs_test_dup = seqs_train.copy() + seqs_test_dup["T"] = seqs_test["T"] + seqs_test_dup["y"] = seqs_test["y"] + seqs_test_dup["trialId"] = seqs_test["trialId"] + + seqs_test = seqs_test_dup + seqs_test = gpfa_core.exact_inference_with_ll(seqs_test, params_est) + X = np.hstack(seqs_test['xsm']) + Xorth, Corth, _ = gpfa_util.orthogonalize(X, C) + seqs_test = gpfa_util.segment_by_trial(seqs_test, Xorth, 'xorth') + + return params_est, seqs_train, seqs_test + + +def gpfa(data, bin_size=20*pq.ms, x_dim=3, em_max_iters=500): + """ + Prepares data and calls functions for extracting neural trajectories in the + orthonormal space. + + The function combines calls to `extract_trajectory()` and `postprocess()` + in a scenario, where no cross-validation of the embedding is performed. + + Parameters + ---------- + + data : list of list of Spiketrain objects + The outer list corresponds to trials and the inner list corresponds to + the neurons recorded in that trial, such that data[l][n] is the + Spiketrain of neuron n in trial l. 
Note that the number and order of + Spiketrains objects per trial must be fixed such that data[l][n] and + data[k][n] refer to the same spike generator for any choice of l,k and + n. + bin_size : quantities.Quantity, optional + Width of each time bin. + Default is 20 ms. + x_dim : int, optional + State dimensionality. + Default is 3. + em_max_iters : int, optional + Number of EM iterations to run (default: 500). + + Returns + ------- + + parameter_estimates: dict + Estimated model parameters. + When the GPFA method is used, following parameters are contained + covType: {'rbf', 'tri', 'logexp'} + type of GP covariance + Currently, only 'rbf' is supported. + gamma: ndarray of shape (1, #latent_vars) + related to GP timescales by 'bin_width / sqrt(gamma)' + eps: ndarray of shape (1, #latent_vars) + GP noise variances + d: ndarray of shape (#units, 1) + observation mean + C: ndarray of shape (#units, #latent_vars) + mapping between the neuronal data space and the latent variable + space + Corth: ndarray of shape (#units, #latent_vars) + mapping between the neuronal data space and the orthonormal + latent variable space + R: ndarray of shape (#units, #latent_vars) + observation noise covariance + + seqs_train: numpy.recarray + Contains the embedding of the data into the latent variable space. + Data structure, whose n-th entry (corresponding to the n-th + experimental trial) has fields + * trialId: int + unique trial identifier + * T: int + number of timesteps + * y: ndarray of shape (#units, #bins) + neural data + * xsm: ndarray of shape (#latent_vars, #bins) + posterior mean of latent variables at each time bin + * Vsm: ndarray of shape (#latent_vars, #latent_vars, #bins) + posterior covariance between latent variables at each + timepoint + * VsmGP: ndarray of shape (#bins, #bins, #latent_vars) + posterior covariance over time for each latent variable + * xorth: ndarray of shape (#latent_vars, #bins) + trajectory in the orthonormalized space + + fit_info: dict + Information of the fitting process and the parameters used there: + * iteration_time: A list containing the runtime for each iteration + step in the EM algorithm. + * log_likelihood: float, maximized likelihood obtained in the + E-step of the EM algorithm. + * bin_size: int, Width of the bins. + * cvf: int, number for cross-validation folding + Default is 0 (no cross-validation). + * has_spikes_bool: Indicates if a neuron has any spikes across + trials. + * method: str, Method name. + + Raises + ------ + ValueError + If `data` is an empty list. + If `bin_size` if not a `pq.Quantity`. + If `data[0][1][0]` is not a `neo.SpikeTrain`. + + Examples + -------- + In the following example, we calculate the neural trajectories of 20 + Poisson spike train generators recorded in 50 trials with randomized + rates up to 100 Hz. + + >>> import numpy as np + >>> import quantities as pq + >>> from elephant.gpfa import gpfa + >>> from elephant.spike_train_generation import homogeneous_poisson_process + >>> data = [] + >>> for trial in range(50): + >>> n_channels = 20 + >>> firing_rates = np.random.randint(low=1, high=100, + >>> size=n_channels) * pq.Hz + >>> spike_times = [homogeneous_poisson_process(rate=rate) + >>> for rate in firing_rates] + >>> data.append((trial, spike_times)) + >>> params_est, seqs_train, seqs_test, fit_info = gpfa( + >>> data, bin_size=20 * pq.ms, x_dim=8) + + """ + # todo does it makes sense to explicitly pass trial_id? 
+ if len(data) == 0: + raise ValueError("`data` cannot be empty") + if not isinstance(bin_size, pq.Quantity): + raise ValueError("'bin_size' must be of type pq.Quantity") + if not isinstance(data[0][1][0], neo.SpikeTrain): + raise ValueError("structure of the data is not correct: 0-axis " + "should be trials, 1-axis neo spike trains " + "and 2-axis spike times") + + seqs = gpfa_util.get_seq(data, bin_size) + params_est, seqs_train, fit_info = extract_trajectory( + seqs, bin_size=bin_size.rescale('ms').magnitude, x_dim=x_dim, + em_max_iters=em_max_iters) + params_est, seqs_train, _ = postprocess(params_est, seqs_train) + + return params_est, seqs_train, fit_info diff --git a/elephant/gpfa_src/INFO.md b/elephant/gpfa_src/INFO.md new file mode 100644 index 000000000..84bd5a95e --- /dev/null +++ b/elephant/gpfa_src/INFO.md @@ -0,0 +1,7 @@ +# GPFA +Gaussian process factor analysis (GPFA), based on Byron Yu's MATLAB implementation + +The original MATLAB code is available at Byron Yu's website: +https://users.ece.cmu.edu/~byronyu/software.shtml + +Extra requirements: scikit-learn, tqdm \ No newline at end of file diff --git a/elephant/gpfa_src/__init__.py b/elephant/gpfa_src/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/elephant/gpfa_src/gpfa_core.py b/elephant/gpfa_src/gpfa_core.py new file mode 100644 index 000000000..6d3eee0ec --- /dev/null +++ b/elephant/gpfa_src/gpfa_core.py @@ -0,0 +1,526 @@ +# -*- coding: utf-8 -*- +""" +GPFA core functionality. + +:copyright: Copyright 2015-2019 by the Elephant team, see AUTHORS.txt. +:license: Modified BSD, see LICENSE.txt for details. +""" + + +import time +import warnings + +import numpy as np +import scipy.linalg as linalg +import scipy.optimize as optimize +import scipy.sparse as sparse +from sklearn.decomposition import FactorAnalysis +from tqdm import trange + +from . import gpfa_util + + +def learn_gp_params(seq, params, verbose=False): + """Updates parameters of GP state model, given neural trajectories. + + Parameters + ---------- + seq : np.recarray + data structure containing neural trajectories; + params : dict + current GP state model parameters, which gives starting point + for gradient optimization; + verbose : bool, optional + specifies whether to display status messages (default: False) + + Returns + ------- + param_opt : np.ndarray + updated GP state model parameter + + Raises + ------ + ValueError + If `params['covType'] != 'rbf'`. + If `params['notes']['learnGPNoise']` set to True. + + """ + if params['covType'] != 'rbf': + raise ValueError("Only 'rbf' GP covariance type is supported.") + if params['notes']['learnGPNoise']: + raise ValueError("learnGPNoise is not supported.") + param_name = 'gamma' + fname = 'gpfa_util.grad_betgam' + + param_init = params[param_name] + param_opt = {param_name: np.empty_like(param_init)} + + x_dim = param_init.shape[-1] + precomp = gpfa_util.make_precomp(seq, x_dim) + + # Loop once for each state dimension (each GP) + for i in range(x_dim): + const = {'eps': params['eps'][i]} + initp = np.log(param_init[i]) + res_opt = optimize.minimize(eval(fname), initp, + args=(precomp[i], const), + method='L-BFGS-B', jac=True) + param_opt['gamma'][i] = np.exp(res_opt.x) + + if verbose: + print('\n Converged p; xDim:{}, p:{}'.format(i, res_opt.x)) + + return param_opt + + +def exact_inference_with_ll(seq, params, get_ll=True): + """ + Extracts latent trajectories from neural data, given GPFA model parameters. 
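+
+    Since the model is linear-Gaussian, the posterior over the latent
+    trajectory of a trial is available in closed form. Schematically (and up
+    to the persymmetric speed-ups used in the implementation below), the
+    stacked posterior mean over all time bins corresponds to
+
+        xsm = inv(inv(K) + kron(I_T, C' inv(R) C)) kron(I_T, C' inv(R)) (y - d)
+
+    where K is the GP prior covariance across all time bins and latent
+    dimensions. Trials of equal length T share these precomputations and are
+    therefore processed together.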
+ + Parameters + ---------- + seq : np.recarray + Input data structure, whose n-th element (corresponding to the n-th + experimental trial) has fields: + y : ndarray of shape (#units, #bins) + neural data + T : int + number of bins + params : dict + GPFA model parameters whe the following fields: + C : ndarray + FA factor loadings matrix + d : ndarray + FA mean vector + R : ndarray + FA noise covariance matrix + gamma : ndarray + GP timescale + eps : ndarray + GP noise variance + get_ll : bool, optional + specifies whether to compute data log likelihood (default: True) + + Returns + ------- + seq_lat : np.recarray + a copy of the input data structure, augmented with the new + fields: + xsm : ndarray of shape (#latent_vars x #bins) + posterior mean of latent variables at each time bin + Vsm : ndarray of shape (#latent_vars, #latent_vars, #bins) + posterior covariance between latent variables at each + timepoint + VsmGP : ndarray of shape (#bins, #bins, #latent_vars) + posterior covariance over time for each latent + variable + ll : float + data log likelihood, np.nan is returned when `get_ll` is set False + """ + y_dim, x_dim = params['C'].shape + + # copy the contents of the input data structure to output structure + dtype_out = [(x, seq[x].dtype) for x in seq.dtype.names] + dtype_out.extend([('xsm', np.object), ('Vsm', np.object), + ('VsmGP', np.object)]) + seq_lat = np.empty(len(seq), dtype=dtype_out) + for dtype_name in seq.dtype.names: + seq_lat[dtype_name] = seq[dtype_name] + + # Precomputations + if params['notes']['RforceDiagonal']: + rinv = np.diag(1.0 / np.diag(params['R'])) + logdet_r = (np.log(np.diag(params['R']))).sum() + else: + rinv = linalg.inv(params['R']) + rinv = (rinv + rinv.T) / 2 # ensure symmetry + logdet_r = gpfa_util.logdet(params['R']) + + c_rinv = params['C'].T.dot(rinv) + c_rinv_c = c_rinv.dot(params['C']) + + t_all = seq_lat['T'] + t_uniq = np.unique(t_all) + ll = 0. + + # Overview: + # - Outer loop on each element of Tu. + # - For each element of Tu, find all trials with that length. + # - Do inference and LL computation for all those trials together. + for t in t_uniq: + k_big, k_big_inv, logdet_k_big = gpfa_util.make_k_big(params, t) + k_big = sparse.csr_matrix(k_big) + + blah = [c_rinv_c for _ in range(t)] + c_rinv_c_big = linalg.block_diag(*blah) # (xDim*T) x (xDim*T) + minv, logdet_m = gpfa_util.inv_persymm(k_big_inv + c_rinv_c_big, x_dim) + + # Note that posterior covariance does not depend on observations, + # so can compute once for all trials with same T. 
+ # xDim x xDim posterior covariance for each timepoint + vsm = np.full((x_dim, x_dim, t), np.nan) + idx = np.arange(0, x_dim * t + 1, x_dim) + for i in range(t): + vsm[:, :, i] = minv[idx[i]:idx[i + 1], idx[i]:idx[i + 1]] + + # T x T posterior covariance for each GP + vsm_gp = np.full((t, t, x_dim), np.nan) + for i in range(x_dim): + vsm_gp[:, :, i] = minv[i::x_dim, i::x_dim] + + # Process all trials with length T + n_list = np.where(t_all == t)[0] + # dif is yDim x sum(T) + dif = np.hstack(seq_lat[n_list]['y']) - params['d'][:, np.newaxis] + # term1Mat is (xDim*T) x length(nList) + term1_mat = c_rinv.dot(dif).reshape((x_dim * t, -1), order='F') + + # Compute blkProd = CRinvC_big * invM efficiently + # blkProd is block persymmetric, so just compute top half + t_half = np.int(np.ceil(t / 2.0)) + blk_prod = np.zeros((x_dim * t_half, x_dim * t)) + idx = range(0, x_dim * t_half + 1, x_dim) + for i in range(t_half): + blk_prod[idx[i]:idx[i + 1], :] = c_rinv_c.dot( + minv[idx[i]:idx[i + 1], :]) + blk_prod = k_big[:x_dim * t_half, :].dot( + gpfa_util.fill_persymm(np.eye(x_dim * t_half, x_dim * t) - + blk_prod, x_dim, t)) + # xsmMat is (xDim*T) x length(nList) + xsm_mat = gpfa_util.fill_persymm(blk_prod, x_dim, t).dot(term1_mat) + + for i, n in enumerate(n_list): + seq_lat[n]['xsm'] = xsm_mat[:, i].reshape((x_dim, t), order='F') + seq_lat[n]['Vsm'] = vsm + seq_lat[n]['VsmGP'] = vsm_gp + + if get_ll: + # Compute data likelihood + val = -t * logdet_r - logdet_k_big - logdet_m \ + - y_dim * t * np.log(2 * np.pi) + ll = ll + len(n_list) * val - (rinv.dot(dif) * dif).sum() \ + + (term1_mat.T.dot(minv) * term1_mat.T).sum() + + if get_ll: + ll /= 2 + else: + ll = np.nan + + return seq_lat, ll + + +def em(params_init, seq, em_max_iters=500, tol=1.0E-8, min_var_frac=0.01, + freq_ll=5, verbose=False): + """ + Fits GPFA model parameters using expectation-maximization (EM) algorithm. + + Parameters + ---------- + params_init : dict + GPFA model parameters at which EM algorithm is initialized + covType : {'rbf', 'tri', 'logexp'} + type of GP covariance + gamma : ndarray of shape (1, #latent_vars) + related to GP timescales by + 'bin_width / sqrt(gamma)' + eps : ndarray of shape (1, #latent_vars) + GP noise variances + d : ndarray of shape (#units, 1) + observation mean + C : ndarray of shape (#units, #latent_vars) + mapping between the neuronal data space and the + latent variable space + R : ndarray of shape (#units, #latent_vars) + observation noise covariance + seq : np.recarray + training data structure, whose n-th entry (corresponding to the n-th + experimental trial) has fields + trialId : + unique trial identifier + T : int + number of bins + y : ndarray (yDim x T) + neural data + em_max_iters : int, optional + number of EM iterations to run (default: 500) + tol : float, optional + stopping criterion for EM (default: 1e-8) + min_var_frac : float, optional + fraction of overall data variance for each observed + dimension to set as the private variance floor. This is + used to combat Heywood cases, where ML parameter learning + returns one or more zero private variances. (default: 0.01) + (See Martin & McDonald, Psychometrika, Dec 1975.) + freq_ll : int, optional + data likelihood is computed at every freq_ll EM iterations. + freq_ll = 1 means that data likelihood is computed at every + iteration. 
(default: 5) + verbose : bool, optional + specifies whether to display status messages (default: false) + + Returns + ------- + params_est : dict + GPFA model parameter estimates, returned by EM algorithm (same + format as params_init) + seq_lat : dict + a copy of the training data structure, augmented with the new + fields: + xsm : ndarray of shape (#latent_vars x #bins) + posterior mean of latent variables at each time bin + Vsm : ndarray of shape (#latent_vars, #latent_vars, #bins) + posterior covariance between latent variables at each + timepoint + VsmGP : ndarray of shape (#bins, #bins, #latent_vars) + posterior covariance over time for each latent + variable + ll : list + list of log likelihoods after each EM iteration + iter_time : list + lisf of computation times (in seconds) for each EM iteration + """ + params = params_init + t = seq['T'] + y_dim, x_dim = params['C'].shape + lls = [] + ll_old = ll_base = ll = 0.0 + iter_time = [] + var_floor = min_var_frac * np.diag(np.cov(np.hstack(seq['y']))) + seq_lat = None + + # Loop once for each iteration of EM algorithm + for iter_id in trange(1, em_max_iters + 1, desc='EM iteration'): + if verbose: + print() + tic = time.time() + + if (np.fmod(iter_id, freq_ll) == 0) or (iter_id <= 2): + get_ll = True + else: + get_ll = False + + # ==== E STEP ===== + if not np.isnan(ll): + ll_old = ll + seq_lat, ll = exact_inference_with_ll(seq, params, get_ll=get_ll) + lls.append(ll) + + # ==== M STEP ==== + sum_p_auto = np.zeros((x_dim, x_dim)) + for seq_lat_n in seq_lat: + sum_p_auto += seq_lat_n['Vsm'].sum(axis=2) \ + + seq_lat_n['xsm'].dot(seq_lat_n['xsm'].T) + y = np.hstack(seq['y']) + xsm = np.hstack(seq_lat['xsm']) + sum_yxtrans = y.dot(xsm.T) + sum_xall = xsm.sum(axis=1)[:, np.newaxis] + sum_yall = y.sum(axis=1)[:, np.newaxis] + + # term is (xDim+1) x (xDim+1) + term = np.vstack([np.hstack([sum_p_auto, sum_xall]), + np.hstack([sum_xall.T, t.sum().reshape((1, 1))])]) + # yDim x (xDim+1) + cd = gpfa_util.rdiv(np.hstack([sum_yxtrans, sum_yall]), term) + + params['C'] = cd[:, :x_dim] + params['d'] = cd[:, -1] + + # yCent must be based on the new d + # yCent = bsxfun(@minus, [seq.y], currentParams.d); + # R = (yCent * yCent' - (yCent * [seq.xsm]') * currentParams.C') + # / sum(T); + c = params['C'] + d = params['d'][:, np.newaxis] + if params['notes']['RforceDiagonal']: + sum_yytrans = (y * y).sum(axis=1)[:, np.newaxis] + yd = sum_yall * d + term = ((sum_yxtrans - d.dot(sum_xall.T)) * c).sum(axis=1) + term = term[:, np.newaxis] + r = d ** 2 + (sum_yytrans - 2 * yd - term) / t.sum() + + # Set minimum private variance + r = np.maximum(var_floor, r) + params['R'] = np.diag(r[:, 0]) + else: + sum_yytrans = y.dot(y.T) + yd = sum_yall.dot(d.T) + term = (sum_yxtrans - d.dot(sum_xall.T)).dot(c.T) + r = d.dot(d.T) + (sum_yytrans - yd - yd.T - term) / t.sum() + + params['R'] = (r + r.T) / 2 # ensure symmetry + + if params['notes']['learnKernelParams']: + res = learn_gp_params(seq_lat, params, verbose=verbose) + params['gamma'] = res['gamma'] + + t_end = time.time() - tic + iter_time.append(t_end) + + # Verify that likelihood is growing monotonically + if iter_id <= 2: + ll_base = ll + elif verbose and ll < ll_old: + print('\nError: Data likelihood has decreased ', + 'from {0} to {1}'.format(ll_old, ll)) + elif (ll - ll_base) < (1 + tol) * (ll_old - ll_base): + break + + if len(lls) < em_max_iters: + print('Fitting has converged after {0} EM iterations.)'.format( + len(lls))) + + if np.any(np.diag(params['R']) == var_floor): + warnings.warn('Private variance 
floor used for one or more observed ' + 'dimensions in GPFA.') + + return params, seq_lat, lls, iter_time + + +def gpfa_engine(seq_train, seq_test, x_dim=8, bin_width=20.0, tau_init=100.0, + eps_init=1.0E-3, min_var_frac=0.01, em_max_iters=500, + verbose=False): + """Extract neural trajectories using GPFA. + + Parameters + ---------- + seq_train : np.recarray + training data structure, whose n-th element (corresponding to + the n-th experimental trial) has fields + trialId : + unique trial identifier + T : int + number of bins + y : ndarray of shape (#units, #bins) + neural data + seq_test : np.recarray + test data structure (same format as seqTrain) + x_dim : int, optional + state dimensionality (default: 3) + bin_width : float, optional + spike bin width in msec (default: 20) + tau_init : float, optional + GP timescale initialization in msec (default: 100) + eps_init : float, optional + GP noise variance initialization (default: 1e-3) + min_var_frac : float, optional + fraction of overall data variance for each observed + dimension to set as the private variance floor. This is + used to combat Heywood cases, where ML parameter learning + returns one or more zero private variances. (default: 0.01) + (See Martin & McDonald, Psychometrika, Dec 1975.) + em_max_iters : int, optional + number of EM iterations to run (default: 500) + verbose : bool, optional + specifies whether to display status messages (default: False) + + Returns + ------- + + parameter_estimates: dict + Estimated model parameters. + When the GPFA method is used, following parameters are contained + covType: {'rbf', 'tri', 'logexp'} + type of GP covariance + gamma: ndarray of shape (1, #latent_vars) + related to GP timescales by 'bin_width / sqrt(gamma)' + eps: ndarray of shape (1, #latent_vars) + GP noise variances + d: ndarray of shape (#units, 1) + observation mean + C: ndarray of shape (#units, #latent_vars) + mapping between the neuronal data space and the latent variable + space + R: ndarray of shape (#units, #latent_vars) + observation noise covariance + + seqs_train: np.recarray + Data structure, whose n-th entry (corresponding to the n-th + experimental trial) has fields + * trialId: int + unique trial identifier + * T: int + number of timesteps + * y: ndarray of shape (#units, #bins) + neural data + * xsm: ndarray of shape (#latent_vars, #bins) + posterior mean of latent variables at each time bin + * Vsm: ndarray of shape (#latent_vars, #latent_vars, #bins) + posterior covariance between latent variables at each + timepoint + * VsmGP: ndarray of shape (#bins, #bins, #latent_vars) + posterior covariance over time for each latent variable + + fit_info: dict + Information of the fitting process and the parameters used there: + * iteration_time: A list containing the runtime for each iteration + step in the EM algorithm. + * log_likelihood: float, maximized likelihood obtained in the + E-step of the EM algorithm. + * bin_size: int, Width of the bins. + * cvf: int, number for cross-validation folding + Default is 0 (no cross-validation). + * has_spikes_bool: Indicates if a neuron has any spikes across + trials. + * method: str, Method name. + """ + # For compute efficiency, train on equal-length segments of trials + seq_train_cut = gpfa_util.cut_trials(seq_train) + if len(seq_train_cut) == 0: + warnings.warn('No segments extracted for training. 
Defaulting to ' + 'segLength=Inf.') + seq_train_cut = gpfa_util.cut_trials(seq_train, seg_length=np.inf) + + # ================================== + # Initialize state model parameters + # ================================== + params_init = dict() + params_init['covType'] = 'rbf' + # GP timescale + # Assume binWidth is the time step size. + params_init['gamma'] = (bin_width / tau_init) ** 2 * np.ones(x_dim) + # GP noise variance + params_init['eps'] = eps_init * np.ones(x_dim) + + # ======================================== + # Initialize observation model parameters + # ======================================== + print('Initializing parameters using factor analysis...') + + y_all = np.hstack(seq_train_cut['y']) + fa = FactorAnalysis(n_components=x_dim, copy=True, + noise_variance_init=np.diag(np.cov(y_all, bias=True))) + fa.fit(y_all.T) + params_init['d'] = y_all.mean(axis=1) + params_init['C'] = fa.components_.T + params_init['R'] = np.diag(fa.noise_variance_) + + # Define parameter constraints + params_init['notes'] = { + 'learnKernelParams': True, + 'learnGPNoise': False, + 'RforceDiagonal': True, + } + + # ===================== + # Fit model parameters + # ===================== + print('\nFitting GPFA model...') + + params_est, seq_train_cut, ll_cut, iter_time = em( + params_init, seq_train_cut, min_var_frac=min_var_frac, + em_max_iters=em_max_iters, verbose=verbose) + + # Extract neural trajectories for original, unsegmented trials + # using learned parameters + seq_train, ll_train = exact_inference_with_ll(seq_train, params_est) + + if len(seq_test) > 0: + pass + # TODO: include the assessment of generalization performance + + fit_info = {'iteration_time': iter_time, 'log_likelihood': ll_train} + return params_est, seq_train, fit_info + + +def two_stage_engine(seqTrain, seqTest, typ='fa', xDim=3, binWidth=20.0): + raise NotImplemented diff --git a/elephant/gpfa_src/gpfa_util.py b/elephant/gpfa_src/gpfa_util.py new file mode 100644 index 000000000..6049ca3b9 --- /dev/null +++ b/elephant/gpfa_src/gpfa_util.py @@ -0,0 +1,596 @@ +# -*- coding: utf-8 -*- +""" +GPFA util functions. + +:copyright: Copyright 2015-2019 by the Elephant team, see AUTHORS.txt. +:license: Modified BSD, see LICENSE.txt for details. +""" + + +import warnings + +import numpy as np +import quantities as pq +import scipy as sp + +from elephant.conversion import BinnedSpikeTrain + + +def get_seq(data, bin_size, use_sqrt=True): + """ + Converts the data into a rec array using internally BinnedSpikeTrain. + + Parameters + ---------- + + data : list of list of Spiketrain objects + The outer list corresponds to trials and the inner list corresponds to + the neurons recorded in that trial, such that data[l][n] is the + Spiketrain of neuron n in trial l. Note that the number and order of + Spiketrains objects per trial must be fixed such that data[l][n] and + data[k][n] refer to the same spike generator for any choice of l,k and + n. + bin_size: quantity.Quantity + Spike bin width + + use_sqrt: bool + Boolean specifying whether or not to use square-root transform on + spike counts (see original paper for motivation). + Default is True + + Returns + ------- + + seq + data structure, whose nth entry (corresponding to the nth experimental + trial) has fields + * trialId: unique trial identifier + * T: (1 x 1) number of timesteps + * y: (yDim x T) neural data + + Raises + ------ + ValueError + if `bin_size` is not a pq.Quantity. 
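+
+    Examples
+    --------
+    A minimal sketch; the spike times below are arbitrary placeholders:
+
+    >>> import neo
+    >>> import quantities as pq
+    >>> from elephant.gpfa_src.gpfa_util import get_seq
+    >>> st = neo.SpikeTrain([10., 50., 120.] * pq.ms, t_stop=200 * pq.ms)
+    >>> data = [(0, [st, st])]                # one trial with two units
+    >>> seq = get_seq(data, bin_size=20 * pq.ms)
+    >>> y0 = seq[0]['y']                      # shape (#units, #bins) = (2, 10)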
+ + """ + if not isinstance(bin_size, pq.Quantity): + raise ValueError("'bin_size' must be of type pq.Quantity") + + seq = [] + for dat in data: + trial_id = dat[0] + sts = dat[1] + binned_sts = BinnedSpikeTrain(sts, binsize=bin_size) + if use_sqrt: + binned = np.sqrt(binned_sts.to_array()) + else: + binned = binned_sts.to_array() + seq.append( + (trial_id, binned_sts.num_bins, binned)) + seq = np.array(seq, + dtype=[('trialId', np.int), ('T', np.int), + ('y', 'O')]) + + # Remove trials that are shorter than one bin width + if len(seq) > 0: + trials_to_keep = seq['T'] > 0 + seq = seq[trials_to_keep] + + return seq + + +def cut_trials(seq_in, seg_length=20): + """ + Extracts trial segments that are all of the same length. Uses + overlapping segments if trial length is not integer multiple + of segment length. Ignores trials with length shorter than + one segment length. + + Parameters + ---------- + + seq_in + data structure, whose nth entry (corresponding to + the nth experimental trial) has fields + * trialId: unique trial identifier + * T: (1 x 1) number of timesteps in trial + * y: (yDim x T) neural data + + seg_length : int + length of segments to extract, in number of timesteps. If infinite, + entire trials are extracted, i.e., no segmenting. + Default is 20 + + + Returns + ------- + + seqOut + data structure, whose nth entry (corresponding to + the nth experimental trial) has fields + * trialId: identifier of trial from which segment was taken + * segId: segment identifier within trial + * T: (1 x 1) number of timesteps in segment + * y: (yDim x T) neural data + + Raises + ------ + ValueError + If `seq_length == 0`. + + """ + if seg_length == 0: + raise ValueError("At least 1 extracted trial must be returned") + if np.isinf(seg_length): + seqOut = seq_in + return seqOut + + dtype_seqOut = [('trialId', np.int), ('segId', np.int), ('T', np.int), + ('y', np.object)] + seqOut_buff = [] + for n, seqIn_n in enumerate(seq_in): + T = seqIn_n['T'] + + # Skip trials that are shorter than segLength + if T < seg_length: + warnings.warn('trialId {0:4d} shorter than one segLength...' + 'skipping'.format(seqIn_n['trialId'])) + continue + + numSeg = np.int(np.ceil(float(T) / seg_length)) + + # Randomize the sizes of overlaps + if numSeg == 1: + cumOL = np.array([0, ]) + else: + totalOL = (seg_length * numSeg) - T + probs = np.ones(numSeg - 1, np.float) / (numSeg - 1) + randOL = np.random.multinomial(totalOL, probs) + cumOL = np.hstack([0, np.cumsum(randOL)]) + + seg = np.empty(numSeg, dtype_seqOut) + seg['trialId'] = seqIn_n['trialId'] + seg['T'] = seg_length + + for s, seg_s in enumerate(seg): + tStart = seg_length * s - cumOL[s] + + seg_s['segId'] = s + seg_s['y'] = seqIn_n['y'][:, tStart:tStart + seg_length] + + seqOut_buff.append(seg) + + if len(seqOut_buff) > 0: + seqOut = np.hstack(seqOut_buff) + else: + seqOut = np.empty(0, dtype_seqOut) + + return seqOut + + +def rdiv(a, b): + """ + Returns the solution to x b = a. Equivalent to MATLAB right matrix + division: a / b + """ + return np.linalg.solve(b.T, a.T).T + + +def logdet(A): + """ + log(det(A)) where A is positive-definite. + This is faster and more stable than using log(det(A)). + + Written by Tom Minka + (c) Microsoft Corporation. All rights reserved. + """ + U = np.linalg.cholesky(A) + return 2 * (np.log(np.diag(U))).sum() + + +def make_k_big(params, n_timesteps): + """ + Constructs full GP covariance matrix across all state dimensions and + timesteps. 
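+
+    For the 'rbf' covariance, the block for latent dimension i has entries
+
+        K_i(t1, t2) = (1 - eps_i) * exp(-gamma_i / 2 * (t1 - t2)**2)
+                      + eps_i * delta(t1, t2),
+
+    i.e. a squared-exponential kernel plus a small diagonal noise term, whose
+    characteristic timescale corresponds to bin_width / sqrt(gamma_i).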
+ + Parameters + ---------- + + params + GPFA model parameters + n_timesteps : int + number of timesteps + + Returns + ------- + + K_big + GP covariance matrix with dimensions (xDim * T) x (xDim * T). + The (t1, t2) block is diagonal, has dimensions xDim x xDim, + and represents the covariance between the state vectors at + timesteps t1 and t2. K_big is sparse and striped. + K_big_inv + Inverse of K_big + logdet_K_big + Log determinant of K_big + + Raises + ------ + ValueError + If `params['covType'] != 'rbf'`. + + """ + if params['covType'] != 'rbf': + raise ValueError("Only 'rbf' GP covariance type is supported.") + + xDim = params['C'].shape[1] + + K_big = np.zeros((xDim * n_timesteps, xDim * n_timesteps)) + K_big_inv = np.zeros((xDim * n_timesteps, xDim * n_timesteps)) + Tdif = np.tile(np.arange(0, n_timesteps), (n_timesteps, 1)).T \ + - np.tile(np.arange(0, n_timesteps), (n_timesteps, 1)) + logdet_K_big = 0 + + for i in range(xDim): + K = (1 - params['eps'][i]) * np.exp(-params['gamma'][i] / 2 * + Tdif ** 2) \ + + params['eps'][i] * np.eye(n_timesteps) + K_big[i::xDim, i::xDim] = K + # the original MATLAB program uses here a special algorithm, provided + # in C and MEX, for inversion of Toeplitz matrix: + # [K_big_inv(idx+i, idx+i), logdet_K] = invToeplitz(K); + # TODO: use an inversion method optimized for Toeplitz matrix + # Below is an attempt to use such a method, not leading to a speed-up. + # # K_big_inv[i::xDim, i::xDim] = sp.linalg.solve_toeplitz((K[:, 0], + # K[0, :]), np.eye(T)) + K_big_inv[i::xDim, i::xDim] = np.linalg.inv(K) + logdet_K = logdet(K) + + logdet_K_big = logdet_K_big + logdet_K + + return K_big, K_big_inv, logdet_K_big + + +def inv_persymm(M, blk_size): + """ + Inverts a matrix that is block persymmetric. This function is + faster than calling inv(M) directly because it only computes the + top half of inv(M). The bottom half of inv(M) is made up of + elements from the top half of inv(M). + + WARNING: If the input matrix M is not block persymmetric, no + error message will be produced and the output of this function will + not be meaningful. + + Parameters + ---------- + + M: numpy.ndarray + The block persymmetric matrix to be inverted + ((blkSize*T) x (blkSize*T)). + Each block is blkSize x blkSize, arranged in a T x T grid. + blk_size: int + Edge length of one block + + Returns + ------- + invM + Inverse of M ((blkSize*T) x (blkSize*T)) + logdet_M + Log determinant of M + """ + T = int(M.shape[0] / blk_size) + Thalf = np.int(np.ceil(T / 2.0)) + mkr = blk_size * Thalf + + invA11 = np.linalg.inv(M[:mkr, :mkr]) + invA11 = (invA11 + invA11.T) / 2 + + # Multiplication of a sparse matrix by a dense matrix is not supported by + # SciPy. Making A12 a sparse matrix here an error later. + off_diag_sparse = False + if off_diag_sparse: + A12 = sp.sparse.csr_matrix(M[:mkr, mkr:]) + else: + A12 = M[:mkr, mkr:] + + term = invA11.dot(A12) + F22 = M[mkr:, mkr:] - A12.T.dot(term) + + res12 = rdiv(-term, F22) + res11 = invA11 - res12.dot(term.T) + res11 = (res11 + res11.T) / 2 + + # Fill in bottom half of invM by picking elements from res11 and res12 + invM = fill_persymm(np.hstack([res11, res12]), blk_size, T) + + logdet_M = -logdet(invA11) + logdet(F22) + + return invM, logdet_M + + +def fill_persymm(p_in, blk_size, n_blocks, blk_size_vert=None): + """ + Fills in the bottom half of a block persymmetric matrix, given the + top half. 
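+
+    Concretely, the bottom half is obtained by mirroring the known top blocks
+    through the centre of the matrix, i.e. at block level
+    Pout[n_blocks - 1 - i, n_blocks - 1 - j] = Pin[i, j].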
+ + Parameters + ---------- + + p_in + Top half of block persymmetric matrix (xDim*Thalf) x (xDim*T), + where Thalf = ceil(T/2) + blk_size : int + Edge length of one block + n_blocks : int + Number of blocks making up a row of Pin + blk_size_vert : int, optional + Vertical block edge length if blocks are not square. + `blk_size` is assumed to be the horizontal block edge length. + + Returns + ------- + + Pout + Full block persymmetric matrix (xDim*T) x (xDim*T) + """ + if blk_size_vert is None: + blk_size_vert = blk_size + + Nh = blk_size * n_blocks + Nv = blk_size_vert * n_blocks + Thalf = np.int(np.floor(n_blocks / 2.0)) + THalf = np.int(np.ceil(n_blocks / 2.0)) + + Pout = np.empty((blk_size_vert * n_blocks, blk_size * n_blocks)) + Pout[:blk_size_vert * THalf, :] = p_in + for i in range(Thalf): + for j in range(n_blocks): + Pout[Nv - (i + 1) * blk_size_vert:Nv - i * blk_size_vert, + Nh - (j + 1) * blk_size:Nh - j * blk_size] \ + = p_in[i * blk_size_vert:(i + 1) * + blk_size_vert, + j * blk_size:(j + 1) * blk_size] + + return Pout + + +def make_precomp(seq, xDim): + """ + Make the precomputation matrices specified by the GPFA algorithm. + + Usage: [precomp] = makePautoSum( seq , xDim ) + + Parameters + ---------- + + seq + The sequence struct of inferred latents, etc. + xDim : int + The dimension of the latent space. + + Returns + ------- + + precomp + The precomp struct will be updated with the posterior covaraince and + the other requirements. + + Notes + ----- + All inputs are named sensibly to those in `learnGPparams`. + This code probably should not be called from anywhere but there. + + We bother with this method because we + need this particular matrix sum to be + as fast as possible. Thus, no error checking + is done here as that would add needless computation. + Instead, the onus is on the caller (which should be + learnGPparams()) to make sure this is called correctly. + + Finally, see the notes in the GPFA README. + """ + + Tall = seq['T'] + Tmax = (Tall).max() + Tdif = np.tile(np.arange(0, Tmax), (Tmax, 1)).T \ + - np.tile(np.arange(0, Tmax), (Tmax, 1)) + + # assign some helpful precomp items + # this is computationally cheap, so we keep a few loops in MATLAB + # for ease of readability. + precomp = np.empty(xDim, dtype=[( + 'absDif', np.object), ('difSq', np.object), ('Tall', np.object), + ('Tu', np.object)]) + for i in range(xDim): + precomp[i]['absDif'] = np.abs(Tdif) + precomp[i]['difSq'] = Tdif ** 2 + precomp[i]['Tall'] = Tall + # find unique numbers of trial lengths + Tu = np.unique(Tall) + # Loop once for each state dimension (each GP) + for i in range(xDim): + precomp_Tu = np.empty(len(Tu), dtype=[( + 'nList', np.object), ('T', np.int), ('numTrials', np.int), + ('PautoSUM', np.object)]) + for j in range(len(Tu)): + T = Tu[j] + precomp_Tu[j]['nList'] = np.where(Tall == T)[0] + precomp_Tu[j]['T'] = T + precomp_Tu[j]['numTrials'] = len(precomp_Tu[j]['nList']) + precomp_Tu[j]['PautoSUM'] = np.zeros((T, T)) + precomp[i]['Tu'] = precomp_Tu + + # at this point the basic precomp is built. The previous steps + # should be computationally cheap. We now try to embed the + # expensive computation in a MEX call, defaulting to MATLAB if + # this fails. The expensive computation is filling out PautoSUM, + # which we initialized previously as zeros. 
+ + ############################################################ + # Fill out PautoSum + ############################################################ + # Loop once for each state dimension (each GP) + for i in range(xDim): + # Loop once for each trial length (each of Tu) + for j in range(len(Tu)): + # Loop once for each trial (each of nList) + for n in precomp[i]['Tu'][j]['nList']: + precomp[i]['Tu'][j]['PautoSUM'] += seq[n]['VsmGP'][:, :, i] \ + + np.outer( + seq[n]['xsm'][i, :], seq[n]['xsm'][i, :]) + return precomp + + +def grad_betgam(p, pre_comp, const): + """ + Gradient computation for GP timescale optimization. + This function is called by minimize.m. + + Parameters + ---------- + + p + variable with respect to which optimization is performed, + where p = log(1 / timescale ^2) + pre_comp + structure containing precomputations + const + contains hyperparameters + + Returns + ------- + + f - value of objective function E[log P({x},{y})] at p + df - gradient at p + """ + Tall = pre_comp['Tall'] + Tmax = Tall.max() + + # temp is Tmax x Tmax + temp = (1 - const['eps']) * np.exp(-np.exp(p) / 2 * pre_comp['difSq']) + Kmax = temp + const['eps'] * np.eye(Tmax) + dKdgamma_max = -0.5 * temp * pre_comp['difSq'] + + dEdgamma = 0 + f = 0 + for j in range(len(pre_comp['Tu'])): + T = pre_comp['Tu'][j]['T'] + Thalf = np.int(np.ceil(T / 2.0)) + + Kinv = np.linalg.inv(Kmax[:T, :T]) + logdet_K = logdet(Kmax[:T, :T]) + + KinvM = Kinv[:Thalf, :].dot(dKdgamma_max[:T, :T]) # Thalf x T + KinvMKinv = (KinvM.dot(Kinv)).T # Thalf x T + + dg_KinvM = np.diag(KinvM) + tr_KinvM = 2 * dg_KinvM.sum() - np.fmod(T, 2) * dg_KinvM[-1] + + mkr = np.int(np.ceil(0.5 * T ** 2)) + numTrials = pre_comp['Tu'][j]['numTrials'] + PautoSUM = pre_comp['Tu'][j]['PautoSUM'] + + pauto_kinv_dot = PautoSUM.ravel('F')[:mkr].dot( + KinvMKinv.ravel('F')[:mkr]) + pauto_kinv_dot_rest = PautoSUM.ravel('F')[-1:mkr - 1:- 1].dot( + KinvMKinv.ravel('F')[:(T ** 2 - mkr)]) + dEdgamma = dEdgamma - 0.5 * numTrials * tr_KinvM \ + + 0.5 * pauto_kinv_dot \ + + 0.5 * pauto_kinv_dot_rest + + f = f - 0.5 * numTrials * logdet_K \ + - 0.5 * (PautoSUM * Kinv).sum() + + f = -f + # exp(p) is needed because we're computing gradients with + # respect to log(gamma), rather than gamma + df = -dEdgamma * np.exp(p) + + return f, df + + +def orthogonalize(x, l): + """ + + Orthonormalize the columns of the loading matrix and + apply the corresponding linear transform to the latent variables. + + yDim: data dimensionality + xDim: latent dimensionality + + Parameters + ---------- + + x : np.ndarray + Latent variables (xDim x T) + l : np.ndarray + Loading matrix (yDim x xDim) + + Returns + ------- + + Xorth + Orthonormalized latent variables (xDim x T) + Lorth + Orthonormalized loading matrix (yDim x xDim) + TT + Linear transform applied to latent variables (xDim x xDim) + """ + xDim = l.shape[1] + if xDim == 1: + TT = np.sqrt(np.dot(l.T, l)) + Lorth = rdiv(l, TT) + Xorth = np.dot(TT, x) + else: + UU, DD, VV = sp.linalg.svd(l, full_matrices=False) + # TT is transform matrix + TT = np.dot(np.diag(DD), VV) + + Lorth = UU + Xorth = np.dot(TT, x) + return Xorth, Lorth, TT + + +def segment_by_trial(seq, x, fn): + """ + Segment and store data by trial. 
+ + Parameters + ---------- + + seq + Data structure that has field T, the number of timesteps + x : np.ndarray + Data to be segmented (any dimensionality x total number of timesteps) + fn + New field name of seq where segments of X are stored + + Returns + ------- + + seq_new + Data structure with new field `fn` + + Raises + ------ + ValueError + If `seq['T']) != x.shape[1]`. + + """ + if np.sum(seq['T']) != x.shape[1]: + raise (ValueError, 'size of X incorrect.') + + dtype_new = [(i, seq[i].dtype) for i in seq.dtype.names] + dtype_new.append((fn, np.object)) + seq_new = np.empty(len(seq), dtype=dtype_new) + for dtype_name in seq.dtype.names: + seq_new[dtype_name] = seq[dtype_name] + + ctr = 0 + for n, T in enumerate(seq['T']): + seq_new[n][fn] = x[:, ctr:ctr + T] + ctr += T + + return seq_new diff --git a/elephant/spike_train_generation.py b/elephant/spike_train_generation.py index d125bb8c6..7a80aecfd 100644 --- a/elephant/spike_train_generation.py +++ b/elephant/spike_train_generation.py @@ -331,7 +331,7 @@ def homogeneous_poisson_process(rate, t_start=0.0 * ms, t_stop=1000.0 * ms, """ if not isinstance(t_start, Quantity) or not isinstance(t_stop, Quantity): raise ValueError("t_start and t_stop must be of type pq.Quantity") - rate = rate.rescale((1 / t_start).units) + rate = rate.rescale(1 / t_start.units) mean_interval = 1 / rate.magnitude return _homogeneous_process( np.random.exponential, (mean_interval,), rate, t_start, t_stop, diff --git a/elephant/test/test_gpfa.py b/elephant/test/test_gpfa.py new file mode 100644 index 000000000..195a01383 --- /dev/null +++ b/elephant/test/test_gpfa.py @@ -0,0 +1,177 @@ +# -*- coding: utf-8 -*- +""" +Unit tests for the GPFA analysis. + +:copyright: Copyright 2014-2016 by the Elephant team, see AUTHORS.txt. +:license: Modified BSD, see LICENSE.txt for details. 
+""" + +import sys +import unittest + +import neo +import numpy as np +import quantities as pq +from numpy.testing import assert_array_equal, assert_array_almost_equal + +from elephant.spike_train_generation import homogeneous_poisson_process + +try: + import sklearn +except ImportError: + HAVE_SKLEARN = False +else: + HAVE_SKLEARN = True + from elephant.gpfa_src import gpfa_util + from elephant.gpfa import gpfa + +python_version_major = sys.version_info.major + + +@unittest.skipUnless(HAVE_SKLEARN, 'requires sklearn') +class GPFATestCase(unittest.TestCase): + def setUp(self): + def gen_gamma_spike_train(k, theta, t_max): + x = [] + for i in range(int(3 * t_max / (k*theta))): + x.append(np.random.gamma(k, theta)) + s = np.cumsum(x) + return s[s < t_max] + + def gen_test_data(rates, durs, shapes=(1, 1, 1, 1)): + s = gen_gamma_spike_train(shapes[0], 1./rates[0], durs[0]) + for i in range(1, 4): + s_i = gen_gamma_spike_train(shapes[i], 1./rates[i], durs[i]) + s = np.concatenate([s, s_i + np.sum(durs[:i])]) + return s + + self.n_iters = 10 + self.bin_size = 20 * pq.ms + + # generate data1 + rates_a = (2, 10, 2, 2) + rates_b = (2, 2, 10, 2) + durs = (2.5, 2.5, 2.5, 2.5) + self.data1 = [] + for tr in range(1): + np.random.seed(tr) + n1 = neo.SpikeTrain(gen_test_data(rates_a, durs), units=1*pq.s, + t_start=0*pq.s, t_stop=10*pq.s) + n2 = neo.SpikeTrain(gen_test_data(rates_a, durs), units=1*pq.s, + t_start=0*pq.s, t_stop=10*pq.s) + n3 = neo.SpikeTrain(gen_test_data(rates_b, durs), units=1*pq.s, + t_start=0*pq.s, t_stop=10*pq.s) + n4 = neo.SpikeTrain(gen_test_data(rates_b, durs), units=1*pq.s, + t_start=0*pq.s, t_stop=10*pq.s) + self.data1.append((tr, [n1, n2, n3, n4])) + self.x_dim = 4 + + # generate data2 + np.random.seed(27) + self.data2 = [] + n_trials = 10 + n_channels = 20 + for trial in range(n_trials): + rates = np.random.randint(low=1, high=100, size=n_channels) + spike_times = [homogeneous_poisson_process(rate=rate*pq.Hz) + for rate in rates] + self.data2.append((trial, spike_times)) + + def test_data1(self): + params_est, seqs_train, fit_info = gpfa( + self.data1, x_dim=self.x_dim, em_max_iters=self.n_iters) + self.assertEqual(fit_info['bin_size'], 20*pq.ms) + self.assertEqual(fit_info['min_var_frac'], 0.01) + self.assertTrue(np.all(fit_info['has_spikes_bool'])) + self.assertAlmostEqual(fit_info['log_likelihood'], -27.222600197474762) + # Since data1 is inherently 2 dimensional, only the first two + # dimensions of xorth should have finite power. 
+ for i in [0, 1]: + self.assertNotEqual(seqs_train['xorth'][0][i].mean(), 0) + self.assertNotEqual(seqs_train['xorth'][0][i].var(), 0) + for i in [2, 3]: + self.assertEqual(seqs_train['xorth'][0][i].mean(), 0) + self.assertEqual(seqs_train['xorth'][0][i].var(), 0) + + def test_invalid_input_data(self): + invalid_data = [(0, [0, 1, 2])] + invalid_bin_size = 10 + with self.assertRaises(ValueError): + gpfa(data=invalid_data) + with self.assertRaises(ValueError): + gpfa(data=[]) + with self.assertRaises(ValueError): + gpfa(data=self.data2, bin_size=invalid_bin_size) + + def test_data2(self): + params_est, seqs_train, fit_info = gpfa( + self.data2, bin_size=self.bin_size, x_dim=8, + em_max_iters=self.n_iters) + self.assertEqual(fit_info['bin_size'], self.bin_size, + "Input and output bin_size don't match") + n_trials = len(self.data2) + t_start = self.data2[0][1][0].t_stop + t_stop = self.data2[0][1][0].t_start + n_bins = int(((t_start - t_stop) / self.bin_size).magnitude) + assert_array_equal(seqs_train['T'], [n_bins,] * n_trials) + assert_array_equal(seqs_train['trialId'], np.arange(n_trials)) + for key in ['y', 'xsm', 'Vsm', 'VsmGP', 'xorth']: + self.assertEqual(len(seqs_train[key]), n_trials, + msg="Failed ndarray field {0}".format(key)) + self.assertEqual(len(seqs_train), n_trials) + + def test_get_seq_sqrt(self): + data = [self.data2[0]] + seqs = gpfa_util.get_seq(data, bin_size=self.bin_size) + seqs_not_sqrt = gpfa_util.get_seq(data, bin_size=self.bin_size, + use_sqrt=False) + for common_key in ('trialId', 'T'): + self.assertEqual(seqs[common_key], seqs_not_sqrt[common_key]) + self.assertEqual(seqs['y'].shape, seqs_not_sqrt['y'].shape) + + def test_cut_trials_inf(self): + same_data = gpfa_util.cut_trials(self.data2, seg_length=np.Inf) + assert same_data is self.data2 + + def test_cut_trials_zero_length(self): + seqs = gpfa_util.get_seq(self.data2, bin_size=self.bin_size) + with self.assertRaises(ValueError): + gpfa_util.cut_trials(seqs, seg_length=0) + + def test_cut_trials_same_length(self): + data = [self.data2[0]] + seqs = gpfa_util.get_seq(data, bin_size=self.bin_size) + seg_length = seqs[0]['T'] + seqs_cut = gpfa_util.cut_trials(seqs, seg_length=seg_length) + assert_array_almost_equal(seqs[0]['y'], seqs_cut[0]['y']) + + @unittest.skipUnless(python_version_major == 3, "assertWarns requires 3.2") + def test_cut_trials_larger_length(self): + data = [self.data2[0]] + seqs = gpfa_util.get_seq(data, bin_size=self.bin_size) + seg_length = seqs[0]['T'] + 1 + with self.assertWarns(UserWarning): + gpfa_util.cut_trials(seqs, seg_length=seg_length) + + def test_logdet(self): + np.random.seed(27) + # generate a positive definite matrix + matrix = np.random.randn(20, 20) + matrix = matrix.dot(matrix.T) + logdet_fast = gpfa_util.logdet(matrix) + logdet_ground_truth = np.log(np.linalg.det(matrix)) + assert_array_almost_equal(logdet_fast, logdet_ground_truth) + + +def suite(): + suite = unittest.makeSuite(GPFATestCase, 'test') + return suite + + +def run(): + runner = unittest.TextTestRunner(verbosity=2) + runner.run(suite()) + + +if __name__ == "__main__": + unittest.main() diff --git a/requirements-extras.txt b/requirements-extras.txt index f46e91593..17fec19af 100644 --- a/requirements-extras.txt +++ b/requirements-extras.txt @@ -1,3 +1,4 @@ pandas>=0.14.1 scikit-learn mpi4py>=3.0.1 +tqdm