Hello.

Here is code to fix gaps in the files. The functions are from the pipeline, but I'm going to define everything locally in this notebook, so that it's self contained. Here's all the stolen stuff:

In [10]:
from pathlib import Path

import os
import math
import numpy as np
import scipy.signal
import scipy.io
import time
import struct
from copy import deepcopy

import matplotlib.pylab as plt

# constants
NUM_HEADER_BYTES = 1024
SAMPLES_PER_RECORD = 1024
BYTES_PER_SAMPLE = 2
RECORD_SIZE = 4 + 8 + SAMPLES_PER_RECORD * BYTES_PER_SAMPLE + 10 # size of each continuous record in bytes
RECORD_MARKER = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 255])

# constants for pre-allocating matrices:
MAX_NUMBER_OF_SPIKES = int(1e6)
MAX_NUMBER_OF_RECORDS = int(1e6)
MAX_NUMBER_OF_EVENTS = int(1e6)


def readHeader(f):
    header = { }
    h = f.read(1024).decode().replace('\n','').replace('header.','')
    for i,item in enumerate(h.split(';')):
        if '=' in item:
            header[item.split(' = ')[0]] = item.split(' = ')[1]
    return header


def loadContinuous(filepath, dtype = float):

    assert dtype in (float, np.int16), \
        'Invalid data type specified for loadContinous, valid types are float and np.int16'

    #print("Loading continuous data...")

    ch = { }

    #read in the data
    f = open(filepath,'rb')

    fileLength = os.fstat(f.fileno()).st_size

    # calculate number of samples
    recordBytes = fileLength - NUM_HEADER_BYTES
    if  recordBytes % RECORD_SIZE != 0:
        raise Exception("File size is not consistent with a continuous file: may be corrupt")
    nrec = recordBytes // RECORD_SIZE
    nsamp = nrec * SAMPLES_PER_RECORD
    # pre-allocate samples
    samples = np.zeros(nsamp, dtype)
    timestamps = np.zeros(nrec)
    recordingNumbers = np.zeros(nrec)
    indices = np.arange(0, nsamp + 1, SAMPLES_PER_RECORD, np.dtype(np.int64))

    header = readHeader(f)

    recIndices = np.arange(0, nrec)

    for recordNumber in recIndices:

        timestamps[recordNumber] = np.fromfile(f,np.dtype('<i8'),1) # little-endian 64-bit signed integer
        N = np.fromfile(f,np.dtype('<u2'),1)[0] # little-endian 16-bit unsigned integer

        #print index

        if N != SAMPLES_PER_RECORD:
            raise Exception('Found corrupted record in block ' + str(recordNumber))

        recordingNumbers[recordNumber] = (np.fromfile(f,np.dtype('>u2'),1)) # big-endian 16-bit unsigned integer

        if dtype == float: # Convert data to float array and convert bits to voltage.
            data = np.fromfile(f,np.dtype('>i2'),N) * float(header['bitVolts']) # big-endian 16-bit signed integer, multiplied by bitVolts
        else:  # Keep data in signed 16 bit integer format.
            data = np.fromfile(f,np.dtype('>i2'),N)  # big-endian 16-bit signed integer
        samples[indices[recordNumber]:indices[recordNumber+1]] = data

        marker = f.read(10) # dump

    #print recordNumber
    #print index

    ch['header'] = header
    ch['timestamps'] = timestamps
    ch['data'] = samples  # OR use downsample(samples,1), to save space
    ch['recordingNumber'] = recordingNumbers
    f.close()
    return ch


def writeFrame(f, timestamp, recording_num, x):
    byteWritten = 0
    if x.size==1024:      
        byteWritten += f.write(np.array(timestamp).astype('<i8').tobytes())
        byteWritten += f.write(np.array(1024).astype('<i2').tobytes())
        byteWritten += f.write(np.array(recording_num).astype('<i2').tobytes())
        byteWritten += f.write(x.astype('>i2').tobytes())
        byteWritten += f.write(np.array([0,1,2,3,4,5,6,7,8,255]).astype(np.byte).tobytes())
    else:
        print('Data point not correct. Skipped')
    return byteWritten
    

def writeHeader(f,header):
    headerstr=''
    for k,v in header.items():
        k = 'header.'+k.strip()
        headerstr = headerstr+'{0} = {1};\n'.format(k,v)
    headerstr=headerstr.ljust(1024)
    f.write(headerstr.encode('ascii'))

def writeContinuousFile(fname,header,timestamp,x,recording_num=None,dtype=np.float64):
    f = open(fname,'wb')
    writeHeader(f,header)

    noFrame = x.size//1024
    
    if dtype == np.float64:
        #convert back the value to int according to the bitVolts
        x = np.round(x/np.float64(header['bitVolts']))
    
    for i in range(noFrame):
        if recording_num is not None:
            writeFrame(f,timestamp[i],recording_num[i],x[i*1024:(i+1)*1024])
        else:
            writeFrame(f,timestamp[i],0,x[i*1024:(i+1)*1024])
    
    f.close()

And own functions ( I can write code too! )

This function has two purposes: it can find gaps and it can delete them.

Note: We need to search for the gaps. But sometimes the recording is 0 for a few samples (biggest I've seen was 4). So I define a gap as being at least 10 zeros.

In [8]:
how_many_zeros_is_a_gap = 10

def delete_blanks(raw_data):

    locations = []

    new_raw_data = raw_data

    zero_locations = np.nonzero((new_raw_data == 0)*1)[0]

    j = 0
    while j < len(zero_locations):
        i = zero_locations[j]
        count = 1
        while (count < len(new_raw_data)-i) and new_raw_data[i+count] == 0.:
            count = count + 1
        if count > how_many_zeros_is_a_gap:
            # if you wanna see where the gap is.
            # print(i, count)
            new_raw_data = np.delete(new_raw_data, np.s_[i:i+count])
            zero_locations = np.nonzero((new_raw_data == 0)*1)[0]
            locations.append(i)
        else:
            j = j + count
        count = 1

    return new_raw_data, locations

Here's the list of bad files

In [None]:
bad_files = [
'cohort8_may2021/vr/M10_D15_2021-05-28_10-06-58',
'cohort8_may2021/vr/M15_D27_2021-06-15_13-06-29',
'cohort8_may2021/vr/M15_D13_2021-05-26_12-05-34',
'cohort8_may2021/vr/M15_D16_2021-05-31_12-40-03',
'cohort8_may2021/vr/M13_D29_2021-06-17_11-50-37',
'cohort8_may2021/vr/M14_D41_2021-07-05_12-37-48',
'cohort8_may2021/vr/M14_D41_2021-07-05_12-37-48',
'cohort8_may2021/vr/M13_D18_2021-06-02_11-50-48',
'cohort8_may2021/vr/M11_D29_2021-06-17_10-35-48',
'cohort8_may2021/vr/M10_D16_2021-05-31_09-42-48',
'cohort8_may2021/vr/M12_D7_2021-05-18_10-28-40',
'cohort8_may2021/vr/M15_D9_2021-05-20_12-37-07',
'cohort8_may2021/vr/M12_D24_2021-06-10_11-24-37',
'cohort8_may2021/vr/M14_D37_2021-06-29_12-33-24',
'cohort8_may2021/vr/M14_D33_2021-06-23_12-22-49',
'cohort8_may2021/vr/M11_D14_2021-05-27_10-34-15',
'cohort8_may2021/vr/M11_D17_2021-06-01_10-36-53',
'cohort8_may2021/vr/M13_D11_2021-05-24_11-11-59',
'cohort8_may2021/of/M14_D13_2021-05-26_10-51-36',
'cohort8_may2021/of/M12_D29_2021-06-17_10-31-00',
'cohort7_october2020/vr/M4_D15_2020-11-16_14-59-09',
'cohort7_october2020/vr/M3_D30_2020-12-07_14-55-41',
'cohort7_october2020/vr/M3_D19_2020-11-22_14-48-51',
'cohort7_october2020/vr/M4_D1_2020-10-29_14-21-55',
'cohort7_october2020/vr/M7_D20_2020-11-23_15-47-55',
'cohort7_october2020/vr/M4_D9_2020-11-08_15-18-01',
'cohort7_october2020/vr/M4_D12_2020-11-13_15-08-51',
'cohort7_october2020/vr/M3_D22_2020-11-27_15-01-24',
'cohort7_october2020/vr/M3_D18_2020-11-21_14-29-49',
'cohort7_october2020/vr/M4_D7_2020-11-06_14-58-44',
'cohort7_october2020/vr/M4_D2_2020-10-30_15-23-49',
'cohort7_october2020/vr/M3_D25_2020-11-30_15-13-15',
'cohort7_october2020/vr/M6_D8_2020-11-07_15-49-19',
'cohort7_october2020/vr/M6_D9_2020-11-08_15-53-11',
'cohort7_october2020/vr/M4_D14_2020-11-15_14-58-40',
'cohort7_october2020/vr/M6_D1_2020-10-29_15-04-09',
'cohort7_october2020/vr/M6_D13_2020-11-14_15-40-42',
'cohort7_october2020/vr/M7_D28_2020-12-05_16-15-10',
'cohort7_october2020/vr/M3_D6_2020-11-05_14-37-17',
'cohort7_october2020/vr/M4_D23_2020-11-28_15-45-12',
'cohort7_october2020/vr/M4_D8_2020-11-07_15-13-58',
'cohort7_october2020/vr/M4_D20_2020-11-23_15-08-48',
'cohort7_october2020/vr/M3_D4_2020-11-01_14-12-26',
'cohort7_october2020/vr/M7_D5_2020-11-02_15-52-51',
'cohort7_october2020/vr/M7_D8_2020-11-07_16-24-43',
'cohort7_october2020/vr/M4_D10_2020-11-09_15-04-26',
'cohort7_october2020/vr/M6_D26_2020-12-03_16-42-12',
'cohort7_october2020/vr/M3_D33_2020-12-12_15-02-16',
'cohort7_october2020/vr/M6_D11_2020-11-12_15-39-10',
'cohort7_october2020/vr/M4_D22_2020-11-27_15-38-49',
'cohort7_october2020/vr/M4_D3_2020-10-31_14-46-29',
'cohort7_october2020/vr/M4_D28_2020-12-05_15-44-27',
'cohort7_october2020/vr/M6_D12_2020-11-13_15-44-22',
'cohort7_october2020/of/M6_D8_2020-11-07_16-23-00',
'cohort6_july2020/vr/M1_D9_2020-08-13_15-16-48',
'cohort6_july2020/vr/M1_D6_2020-08-10_14-17-21'
            ]

First, we can check for gaps. I've already run this, so you don't have to!

Another assumption here: I've just looked at the CH1 file, assuming that any gap the same for all channels.

In [None]:
# Point it to the datastore, however you do this
oe_folder = Path("/Volumes/cmvm/sbms/groups/CDBS_SIDB_storage/NolanLab/ActiveProjects/Harry")

for bad_file in bad_files:
    
    data_dict = loadContinuous(oe_folder / bad_file / '100_CH1.continuous', dtype=np.int16)

    print(f"{bad_file}, ", end="")
    
    raw_data = data_dict['data']
    _, locs = delete_blanks(data_dict['data'])

    if len(locs) > 0:
    
        if locs[0] + 1024 == len(raw_data):
            print("just has a blank chunk at the end.", end="")
        else:
            print(f"len = {len(raw_data)/30000/60} ", end="")
            for loc in locs:
                if loc + 1024 == len(raw_data):
                    print("and a blank chunk at the end.", end="")
                else:
                    print(f", gap = {(loc)/30000/60}", end="")
        print()

    else:

        print("no gaps!")

I found the following bad files

In [2]:
actually_bad_files = [
'cohort8_may2021/vr/M13_D29_2021-06-17_11-50-37',
'cohort8_may2021/of/M12_D29_2021-06-17_10-31-00',
'cohort7_october2020/vr/M3_D6_2020-11-05_14-37-17',
'cohort6_july2020/vr/M1_D9_2020-08-13_15-16-48',
'cohort6_july2020/vr/M1_D6_2020-08-10_14-17-21',
'cohort8_may2021/of/M14_D13_2021-05-26_10-51-36'
]

For these ones, we can use the same function to get rid of the chunks, with a 'lil modification

In this code, we create a new folder with the same name but with "_fixed" appended to the name. You can do whatever you want. I, obviously, wouldn't delete the original data yet!

In [44]:
# A little local test for me
#
# oe_folder = Path('/Users/chris/Work/Edinburgh/Spike/data/Harry/')
# actually_bad_files = ['M1_D6_2020-08-10_14-17-21']

for in_folder in actually_bad_files:

    out_folder = in_folder + '_fixed'
    if not os.path.isdir(oe_folder / out_folder):
        os.mkdir(oe_folder / out_folder )

    for a in range(1,3):

        filename = "100_CH" + str(a) + ".continuous"

        data_dict = loadContinuous(oe_folder / in_folder / filename, dtype = np.int16)

        raw_data = data_dict['data']
        new_raw_data, _ = delete_blanks(raw_data)

        timestamps = data_dict['timestamps']
        new_timestamps = (timestamps[0] + np.arange(0,len(new_raw_data)+1024,1024))

        f = open(oe_folder / in_folder / filename, 'rb')
        header = readHeader(f)

        writeContinuousFile(oe_folder / out_folder / filename, header, new_timestamps, new_raw_data, dtype=np.int16)


  timestamps[recordNumber] = np.fromfile(f,np.dtype('<i8'),1) # little-endian 64-bit signed integer
  recordingNumbers[recordNumber] = (np.fromfile(f,np.dtype('>u2'),1)) # big-endian 16-bit unsigned integer


We can re-check the fixed data...

In [51]:
# A little local test for me
#
# oe_folder = Path('/Users/chris/Work/Edinburgh/Spike/data/Harry/')
# bad_files = ['M1_D6_2020-08-10_14-17-21_fixed']

for bad_file in bad_files:

    data_dict = loadContinuous(oe_folder / bad_file / '100_CH1.continuous', dtype=np.int16)

    print(f"{bad_file}, ", end="")

    raw_data = data_dict['data']
    _, locs = delete_blanks(data_dict['data'])

    if len(locs) > 0:

        if locs[0] + 1024 == len(raw_data):
            print("just has a blank chunk at the end.", end="")
        else:
            print(f"len = {len(raw_data)/30000/60} ", end="")
            for loc in locs:
                if loc + 1024 == len(raw_data):
                    print("and a blank chunk at the end.", end="")
                else:
                    print(f", gap = {(loc)/30000/60}", end="")
        print()

    else:
        
        print("no gaps!")

  timestamps[recordNumber] = np.fromfile(f,np.dtype('<i8'),1) # little-endian 64-bit signed integer
  recordingNumbers[recordNumber] = (np.fromfile(f,np.dtype('>u2'),1)) # big-endian 16-bit unsigned integer


M1_D6_2020-08-10_14-17-21_fixed, no gaps!


GOOD TIMES!