# Naive Bayes run on full dataset

## User Options

Define what folder you're saving to

In [1]:
#Folder that the result pickles are written to.
#File names are appended with plain string concatenation (save_folder+dataset+...), so the path must end with a slash.
# save_folder=''
save_folder='/home/jglaser/Files/Neural_Decoding/Results/'

Define what folder you're loading the files from

In [2]:
#Folder that the example data pickles are read from.
#File names are appended with plain string concatenation (load_folder+'example_data_...'), so the path must end with a slash.
# load_folder=''
# load_folder='/Users/jig289/Dropbox/Public/Decoding_Data/'
load_folder='/home/jglaser/Data/DecData/'

Define what dataset you are using

In [3]:
#Which dataset to run on. The rest of the notebook branches on this value and handles 's1', 'm1', and 'hc'.
# dataset='s1'
# dataset='m1'
dataset='hc'

## 1. Import Packages

We import standard packages and functions from the accompanying .py files

In [4]:
#Import standard packages
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from scipy import io
from scipy import stats
import pickle
import time
import sys

#Add the main folder to the path, so we have access to the files there.
#Note that if your working directory is not the Paper_code folder, you may need to manually specify the path to the main folder. For example: sys.path.append('/home/jglaser/GitProj/Neural_Decoding')
sys.path.append('..') 

#Import function to get the covariate matrix that includes spike history from previous bins
from preprocessing_funcs import get_spikes_with_history

#Import metrics
from metrics import get_R2
from metrics import get_rho

#Import decoder functions
from decoders import NaiveBayesDecoder

#Import Bayesian Optimization package
from bayes_opt import BayesianOptimization

  from pandas.core import datetools
Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX TITAN X (CNMeM is enabled with initial size: 45.0% of memory, cuDNN Mixed dnn version. The header is from one version, but we link with a different version (5103, 5110))
  from ._conv import register_converters as _register_converters


In [5]:
#Turn off deprecation warnings

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

## 2. Load Data

The data that we load is in the format described below. We have another example script, "Example_format_data" that may be helpful towards putting the data in this format.

Neural data should be a matrix of size "number of time bins" x "number of neurons", where each entry is the firing rate of a given neuron in a given time bin

The output you are decoding should be a matrix of size "number of time bins" x "number of features you are decoding"

In [6]:
#Load the neural data and the decoding output for the chosen dataset.
#The 's1' and 'm1' files hold (neural_data,vels_binned); the 'hc' file holds (neural_data,pos_binned).
#NOTE: unpickling can execute arbitrary code, so only load data files from a source you trust.

#Fail right away on an unsupported dataset name, rather than with a NameError in a later cell
if dataset not in ('s1','m1','hc'):
    raise ValueError("Unknown dataset '"+str(dataset)+"'; expected 's1', 'm1', or 'hc'")

with open(load_folder+'example_data_'+dataset+'.pickle','rb') as f:
    if sys.version_info[0]>=3:
        #Python 3 needs encoding='latin1' to unpickle NumPy arrays that were pickled by Python 2
        loaded_data=pickle.load(f,encoding='latin1')
    else:
        #Python 2's pickle.load has no encoding argument
        loaded_data=pickle.load(f)

if dataset=='hc':
    neural_data,pos_binned=loaded_data
else:
    neural_data,vels_binned=loaded_data

## 3. Preprocess Data

### 3A. User Inputs
The user can define what time period to use spikes from (with respect to the output).

In [7]:
#Time window of neural data, relative to the output, that is used for decoding.
#Each entry is (bins_before, bins_current, bins_after):
#  bins_before  - how many bins of neural data prior to the output are used for decoding
#  bins_current - whether to use the concurrent time bin of neural data
#  bins_after   - how many bins of neural data after the output are used for decoding
_bins_by_dataset={'s1':(1,1,1),
                  'm1':(2,1,0),
                  'hc':(4,1,5)}

#Any other dataset name leaves the three settings undefined
if dataset in _bins_by_dataset:
    bins_before,bins_current,bins_after=_bins_by_dataset[dataset]

### 3B. Format Covariates

#### Format Input Covariates

In [8]:
#In the HC dataset, drop every neuron (column) whose activity summed over all time bins is below 100
if dataset=='hc':
    nd_sum=np.nansum(neural_data,axis=0) #Per-neuron total across time bins, ignoring NaNs
    rmv_nrn=np.nonzero(nd_sum<100) #Column indices of the neurons to remove
    neural_data=np.delete(neural_data,rmv_nrn,axis=1)

In [9]:
#Decoder input: the neural data matrix (time bins x neurons)
X=neural_data

In [10]:
#Number of bins to sum spikes over
#(the bins before the output, the concurrent bin, and the bins after the output)
N=bins_before+bins_current+bins_after 

#### Format Output Covariates

In [11]:
#Choose the variable to decode: the binned velocities for S1/M1, the binned positions for HC
if dataset in ('s1','m1'):
    y=vels_binned
elif dataset=='hc':
    y=pos_binned

#### In HC dataset, remove time bins with no output (y value)

In [12]:
if dataset=='hc':
    #A time bin has no usable output if either of the first two output columns is NaN
    missing_output=np.logical_or(np.isnan(y[:,0]),np.isnan(y[:,1]))
    rmv_time=np.nonzero(missing_output)
    #Drop those time bins (rows) from both the inputs and the outputs so they stay aligned
    X=np.delete(X,rmv_time,axis=0)
    y=np.delete(y,rmv_time,axis=0)

**In the HC dataset, there is a long period without movement starting at ~80%, so we only use the first 80% of the data**

In [13]:
if dataset=='hc':
    #Keep only the first 80% of the time bins (rows) of the inputs and of the outputs
    num_keep_X=int(.8*X.shape[0])
    num_keep_y=int(.8*y.shape[0])
    X=X[0:num_keep_X,:]
    y=y[0:num_keep_y,:]

### 3C. Define training/testing/validation sets
We have 10 cross-validation folds. In each fold, 10% of the data is a test set, 10% is a validation set, and 80% is the training set. So in the first fold, for example, 0-10% is validation, 10-20% is testing, and 20-100% is training.

In [14]:
#Boundaries of the ten consecutive 10% segments of the data (as fractions of the number of time bins)
fold_edges=[0,.1,.2,.3,.4,.5,.6,.7,.8,.9,1]
deciles=[[fold_edges[k],fold_edges[k+1]] for k in range(len(fold_edges)-1)]

#Fold k uses the k-th segment for validation...
valid_range_all=list(deciles)
#...and the following segment for testing (the last fold wraps around to the first segment)
testing_range_all=deciles[1:]+deciles[:1]

#The training set is everything outside the validation and testing segments.
#Note that it is not always contiguous. For example, in the second fold, the training set has 0-10% and 30-100%.
#In that example, the entry is a list of lists: [[0,.1],[.3,1]]
training_range_all=[]
for k in range(len(deciles)):
    if k==len(deciles)-1:
        #Last fold: validation is 90-100% and testing is 0-10%, which leaves 10-90% for training
        portions=[[fold_edges[1],fold_edges[k]]]
    else:
        portions=[]
        if k>0: #Data before the validation segment
            portions.append([fold_edges[0],fold_edges[k]])
        if k+2<len(deciles): #Data after the testing segment
            portions.append([fold_edges[k+2],fold_edges[-1]])
    training_range_all.append(portions)

num_folds=len(valid_range_all) #Number of cross validation folds

## 4. Run CV

**Initialize lists of results**

In [15]:
#Mean test-set R2 of each fold. np.empty leaves the entries uninitialized until the CV loop fills them in.
mean_r2_nb=np.empty(num_folds)

#Actual outputs of the test/training/validation sets (one entry per fold)
y_test_all,y_train_all,y_valid_all=[],[],[]

#Naive Bayes predictions on the test/training/validation sets (one entry per fold)
y_pred_nb_all,y_train_pred_nb_all,y_valid_pred_nb_all=[],[],[]

**In the following section, we**
1. Loop across folds
2. Extract the training/validation/testing data
3. Preprocess the data
4. Run the individual decoders (whichever have been specified in user options). This includes the hyperparameter optimization
5. Save the results

Note that the Wiener Filter, Wiener Cascade, and XGBoost decoders are commented most fully. So look at those for the best understanding.

In [17]:
t1=time.time() #Start time, used to report how much time has elapsed after every fold

num_examples=X.shape[0] #number of examples (rows in the X matrix)


def _range_to_indices(frac_range,num_examples,bins_before,bins_after):
    """Return the time-bin indices of one contiguous portion of the data.

    frac_range is [start,end], given as fractions of num_examples.
    The portion is shortened by "bins_before" bins at the beginning and "bins_after" bins at the end.
    This makes it so that the different sets don't include overlapping neural data.
    """
    #The builtin int is used because the np.int alias no longer exists in recent NumPy versions
    start=int(np.round(frac_range[0]*num_examples))+bins_before
    end=int(np.round(frac_range[1]*num_examples))-bins_after
    return np.arange(start,end)


def _sum_spikes_over_bins(X_set,N):
    """Return, for every neuron (column of X_set), the total over each window of N consecutive time bins.

    The result has X_set.shape[0]-N+1 rows and is cast to integer format.
    """
    #Convolving w/ ones is a sum across those N bins.
    #The kernel is not scaled by 1/N (and the result multiplied by N afterwards), since that round trip
    #can give values slightly below the true total, which the integer cast would then truncate down by 1.
    kernel=np.ones((N,))
    X_b=np.empty([X_set.shape[0]-N+1,X_set.shape[1]])
    for k in range(X_set.shape[1]):
        X_b[:,k]=np.convolve(X_set[:,k],kernel,mode='valid')
    return X_b.astype(int)


for i in range(num_folds): #Loop through the folds

    ######### SPLIT DATA INTO TRAINING/TESTING/VALIDATION #########

    #Note that all sets have a buffer of "bins_before" bins at the beginning, and "bins_after" bins at the end
    #This makes it so that the different sets don't include overlapping neural data

    #Get testing set for this fold
    testing_set=_range_to_indices(testing_range_all[i],num_examples,bins_before,bins_after)

    #Get validation set for this fold
    valid_set=_range_to_indices(valid_range_all[i],num_examples,bins_before,bins_after)

    #Get training set for this fold.
    #Note this needs to take into account a non-contiguous training set (see section 3C),
    #so the indices of all separated portions are concatenated (however many portions there are)
    training_set=np.concatenate([_range_to_indices(training_range,num_examples,bins_before,bins_after)
                                 for training_range in training_range_all[i]],axis=0)

    #Get training data
    X_train=X[training_set,:]
    y_train=y[training_set,:]

    #Get testing data
    X_test=X[testing_set,:]
    y_test=y[testing_set,:]

    #Get validation data
    X_valid=X[valid_set,:]
    y_valid=y[valid_set,:]



    ##### PREPROCESS DATA #####
    #Combine data across specified bins
    #Get total number of spikes across "bins_before","bins_current",and "bins_after"

    #Below assumes that bins_current=1 (otherwise alignment will be off by 1 between the spikes and outputs)

    #For all neurons, within all the bins being used, get the total number of spikes (sum across all those bins)
    #Do this for the training/validation/testing sets (the result is in integer format)
    X_b_train=_sum_spikes_over_bins(X_train,N)
    X_b_valid=_sum_spikes_over_bins(X_valid,N)
    X_b_test=_sum_spikes_over_bins(X_test,N)

    #Make y's aligned w/ X's
    #e.g. we have to remove the first y if we are using 1 bin before, and have to remove the last y if we are using 1 bin after
    #An explicit end index is used (rather than -bins_after) so that bins_after=0 and bins_before=0 are both handled
    y_train=y_train[bins_before:y_train.shape[0]-bins_after,:]
    y_valid=y_valid[bins_before:y_valid.shape[0]-bins_after,:]
    y_test=y_test[bins_before:y_test.shape[0]-bins_after,:]



    ################# DECODING #################

    #Add actual train/valid/test data to lists (for saving)
    y_test_all.append(y_test)
    y_train_all.append(y_train)
    y_valid_all.append(y_valid)



    #Set "res", the number of bins used (resolution) for decoding predictions
    #So if res=100, we create a 100 x 100 grid going from the minimum to maximum of the output variables
    #The prediction the decoder makes will be a value on that grid
    if dataset=='hc':
        res=100
    if dataset=='m1' or dataset=='s1':
        res=50


    #Declare model
    model_nb=NaiveBayesDecoder(encoding_model='quadratic',res=res) #Use quadratic encoding model and resolution set above

    #Fit model
    model_nb.fit(X_b_train,y_train)

    #Get predictions
    y_test_predicted_nb=model_nb.predict(X_b_test,y_test)

    #Get R2 values on the test set (computed once; used for both the fold mean and the printout)
    R2s_nb=get_R2(y_test,y_test_predicted_nb)
    mean_r2_nb[i]=np.mean(R2s_nb)
    #Print R2 values on test set
    print('R2s:', R2s_nb)

    #Add predictions of test set to lists (for saving)
    #We're not saving predictions on the training/validation sets since we're not including this decoder in our ensemble method
    y_pred_nb_all.append(y_test_predicted_nb)
#     y_train_pred_nb_all.append(model_nb.predict(X_b_train,y_train))
#     y_valid_pred_nb_all.append(model_nb.predict(X_b_valid,y_valid))



    time_elapsed=time.time()-t1 #How much time has passed
    print("time_elapsed:",time_elapsed)


    ###### SAVE RESULTS #####
    #Note that I save them after every cross-validation fold rather than at the end in case the code/computer crashes for some reason while running
    #(the file is overwritten on every fold)

    with open(save_folder+dataset+'_results_nb3.pickle','wb') as f:
        pickle.dump([mean_r2_nb,y_pred_nb_all],f)


#Save ground truth results
with open(save_folder+dataset+'_ground_truth_nb.pickle','wb') as f:
    pickle.dump([y_test_all,y_train_all,y_valid_all],f)

('R2s:', array([0.53720645, 0.79702178]))
('time_elapsed:', 52.453953981399536)
('time_elapsed:', 52.453953981399536)


### Quick check of results

In [None]:
mean_r2_nb

In [None]:
#Overlay the actual and predicted values of the first output dimension,
#for the first 1000 test time bins of the second fold (index 1)
fold_to_plot=1
plt.plot(y_test_all[fold_to_plot][:1000,0])
plt.plot(y_pred_nb_all[fold_to_plot][:1000,0])