In [2]:
# Objective:
# User-friendly function to load time series data and labels and feed it into the training/application
# In this notebook I am preparing the code snippets to make up such a function, and also to identify the
# possible needs for standardisation in input format

import numpy as np
import pandas as pd
from os import listdir
from numpy import genfromtxt

In [3]:
# Short clarification on terminalogy
# The term measurement refers to one entity of investigation, e.g. in a patient/animal/,
# within which  we want to classify something, e.g. behaviour, disease, ...
# The term dataset refers to a set of entities between one may want to study differences in classification
#============================================================================
# Input:

studyname = 'London' # to simplify switching between datasets when testing
# Minimum amount of user input need to interpret data:
if studyname == 'Utrecht':
    datadir = "/home/vincent/estep/data/utrecht"
    multivar = True #False # Are this Multivariate time series TRUE or FALSE
    multiclass = False
    Nfilesperelement = 4 #there are four files per patient
    labloc = "/home/vincent/estep/data/utrecht_labels.csv" # labels in column 1 (0), row 1 (1), or name of file
    idbloc = 2 # Id in column 1 (0), row 1 (1), seperate file (2), not applicable (3)
    labint = True # Is the label an integer?
    timecol = False # time series per column (True) or per row (False)
elif studyname == 'UCR':
    datadir = "/home/vincent/estep/data/UCR_TS_Archive_2015/50words"
    multivar = False # Are this Multivariate time series TRUE or FALSE
    multiclass = False
    Nfilesperelement = 0 #all elements are in one merged file
    labloc = 0 # Classifcation labels in column 1 (0), row 1 (1), seperate file (2)
    idbloc = 3 # Id in column 1 (0), row 1 (1), seperate file (2), not applicable (3)
    labint = True # Is the label an integer?
    timecol = False # time series per column (True) or per row (False)
elif studyname == 'London':
    datadir = "/home/vincent/estep/data/london/accelerometer_40Hz"
    multivar = True # Are this Multivariate time series TRUE or FALSE
    multiclass = True
    Nfilesperelement = 1 #one file per person
    labloc = "/home/vincent/estep/data/london_labels.csv" # Classifcation labels in column 1 (0), row 1 (1), seperate file (2)
    idbloc = 2 # Id in column 1 (0), row 1 (1), seperate file (2), not applicable (3)
    labint = False # Is the label an integer?
    timecol = True # time series per column (True) or per row (False)
    
    
# Other types of input we may need:
# - previously trained model: is this going to be a file?
# - what to do with output: store in file? add to python global workspace?
# - requests and restrictions on what architectures should be tried:
# - - N layers
# - - GPU or not
# - - training time limit
# - - LSTM (RNN) or not
# - - How to handle multi-variate input: as various dimensions of the same 
#     time series are as seperate branches of the network
# - - Performance report

# dummy dictionary just to sketch how the input would look like:
inputdict = {'previousmodel': None,'maxnlayers': 3,'gpu': False,'rnn': True,'verbose': False}

#============================================================================

In [4]:
inputdict

{'gpu': False,
 'maxnlayers': 3,
 'previousmodel': None,
 'rnn': True,
 'verbose': False}

In [5]:
inputdict

{'gpu': False,
 'maxnlayers': 3,
 'previousmodel': None,
 'rnn': True,
 'verbose': False}

In [6]:
# Identify number of files and filetype based on datadir
filenames = listdir(datadir)
Nfiles = len(filenames) # number of files
TXTFILES = [m.endswith('.txt') for m in filenames] # check which files are textfiles
CSVFILES = [m.endswith('.csv') for m in filenames] # check which files are csvfiles

In [7]:
# Investigate what format the first file has by trying out a variety of reading attempts
path = datadir + '/' + filenames[1]
delimiter = [None,','] #possible delimiter values
skiprows=[0,1]
ntests = len(delimiter)*len(skiprows)
df = pd.DataFrame(index=range(ntests),columns=['delimiter','skiprows','nrow','ncol','first cell'])
cnt = 0
for di in delimiter:
    for si in skiprows:
        try:
            F1 = np.loadtxt(fname=path,delimiter=di,skiprows=si)
            df['delimiter'][cnt] = di
            df['skiprows'][cnt] = si
            df['nrow'][cnt] = F1.shape[0]
            df['ncol'][cnt] = F1.shape[1]
            df['first cell'][cnt] = F1[0,1]
        except:
            df['delimiter'][cnt] = di
            df['skiprows'][cnt] = si
            df['nrow'][cnt] = 0
            df['ncol'][cnt] = 0
            df['first cell'][cnt] = 0
        cnt = cnt + 1
# df is now a dataframe with information to help identify how the data should be loaded        
print(df)

  delimiter skiprows     nrow ncol first cell
0      None        0        0    0          0
1      None        1        0    0          0
2         ,        0        0    0          0
3         ,        1  3456000    3   0.651919


In [8]:
# load one file based on the extracted information on fileformat
form = df[df.nrow == max(df.nrow)] # extraction procedure that resulted in the largest number of rows is the best
if form.shape[0] > 1:
    form = df[df.ncol == max(df.ncol)] # extraction procedure that resulted in the largest number of columns

if (form['delimiter'] == ',').bool():
    F2 = np.loadtxt(fname=path,delimiter=',',skiprows=int(form['skiprows']))
else:
    F2 = np.loadtxt(fname=path,delimiter=None,skiprows=int(form['skiprows']))

In [9]:
F2[0:3,0:3]

array([[ 0.45943237,  0.65191934, -0.61158976],
       [ 0.47129694,  0.63217007, -0.63543481],
       [ 0.46734208,  0.64401963, -0.64338316]])

In [10]:
# Extract labels y and data X, and standardize shape of matrix
# Extract data based on newly gained insight into fileformat and data structure
if labint == True:
    labtype = 'int'
else:
    labtype = 'str'
if type(labloc) == str:
    #y = genfromtxt(labloc, delimiter=',',skip_header=1) # do we want numpy array or pd dataframe?
    y = pd.read_csv(labloc, sep=',',header=0)
    #y=pd.read_csv(labloc, sep=',',header=None)
    # TO DO: y needs to be converted in simple one dimensional array that alligns with X
    # - how can we verify which time series links to which label...both should have a unique key
    # - in case of utrecht, the key is in the id in the filename, which is also in y
    # - is this generic for all seperate file?
    # - no, if there is one class per id then we know that it is file level
    # - if there are multiple classes per id (e.g. London data) then we know that it is both between and
    # within files
    if timecol == False:
        X = F2.transpose()
    else:
        X = F2    
elif type(labloc) == int:
    if labloc == 0:
        y = np.array(F2[:,0], dtype=labtype)
        X = F2[:,1:]
    elif labloc == 1:
        y = np.array(F2[0,:], dtype=labtype)
        X = F2[1:,:].transpose()   
        


In [11]:
type(y)

pandas.core.frame.DataFrame

In [10]:
print((y.shape, X.shape))

((235, 11), (3456000, 3))


In [11]:
# Reformat labels to be useful for Keras (in progress):
# we need label size to be the same dimension as the first dimension of the data (which typically is the number
# of time series)

if type(labloc) == str and multiclass == True and y.shape[0] != X.shape[0]:
    #filter y relevant for this filename, this is relevant for the london labels
    y2 = y[y['filename'] == filenames[1].strip('.csv')] #TO DO: make code less specific to London, e.g by
    # requiring that identifiers are compatible and that the user can specify the name of the identifier
    #now convert this into something with the same shape as X, such that we always have a standardized output
    
    # convert clock times to samples since start of measurement
    #y2['stime'] = y2['stime']
    if (y2.shape[0] == X.shape[0]): #no more modification needed, use y2
        y = y2
    else:
        y3 = np.zeros(X.shape[0]) #initialize right shape of y
        #for i in range(y2.shape[0]):
         # y3[y2['stime'][i]:y2['etime'][i]] = [y2['activity'][i]
            
# the following classification is incorrect, because:
# in utrecht data there are not four files per person
if y.shape[0] == X.shape[0]: # no more action needed, annotation fits shape of data
    print('there is one x with all the time series and an y with corresponding label per time series')
elif y.shape[0]*Nfilesperelement == Nfiles: #classification is  per file
    print('there are multiple X, one for each data file, and y holds one label for each time series')
else:
    print('there are multiple X, one for each data file, and y may either hold one or more labels for each time series')

there are multiple X, one for each data file, and y may either hold one or more labels for each time series


In [13]:
y2

Unnamed: 0,ddate1,tudday,tudm,stime,etime,activity,actp,Monitor,ddate1num,distance,filename
0,01/08/2014,1,App,04:00:00,12:00:00,Sleepingandresting(includingsickinbed),Sleepingandresting(includingsickinbed),16563,1406844000,18000,__016563_2014-08-08 12-02-19.bin_day1.RData
1,01/08/2014,1,App,12:00:00,19:00:00,"WatchTV,DVDs,downloadedvideos","WatchingTV,DVDs,downloadedvideos",16563,1406844000,18000,__016563_2014-08-08 12-02-19.bin_day1.RData
2,01/08/2014,1,App,19:00:00,00:00:00,"Speaking,socialisingface-to-face","Speaking,socialisingface-to-face",16563,1406844000,18000,__016563_2014-08-08 12-02-19.bin_day1.RData
3,01/08/2014,1,App,00:00:00,04:00:00,"WatchTV,DVDs,downloadedvideos","WatchingTV,DVDs,downloadedvideos",16563,1406844000,18000,__016563_2014-08-08 12-02-19.bin_day1.RData


In [72]:
#lets remove some columns
# y2 = y2.drop('tudday',1)
# y2 = y2.drop('Monitor',1)
# y2 = y2.drop('distance',1)
y2

Unnamed: 0,ddate1,tudm,stime,etime,activity,actp,ddate1num,filename,stime_numeric,etime_numeric,difftime_numeric
0,01/08/2014,App,04:00:00,12:00:00,Sleepingandresting(includingsickinbed),Sleepingandresting(includingsickinbed),1406844000,__016563_2014-08-08 12-02-19.bin_day1.RData,1406858000.0,1406887000.0,1406858000.0
1,01/08/2014,App,12:00:00,19:00:00,"WatchTV,DVDs,downloadedvideos","WatchingTV,DVDs,downloadedvideos",1406844000,__016563_2014-08-08 12-02-19.bin_day1.RData,1406887000.0,1406912000.0,1406887000.0
2,01/08/2014,App,19:00:00,00:00:00,"Speaking,socialisingface-to-face","Speaking,socialisingface-to-face",1406844000,__016563_2014-08-08 12-02-19.bin_day1.RData,1406912000.0,1406844000.0,1406912000.0
3,01/08/2014,App,00:00:00,04:00:00,"WatchTV,DVDs,downloadedvideos","WatchingTV,DVDs,downloadedvideos",1406844000,__016563_2014-08-08 12-02-19.bin_day1.RData,1406844000.0,1406858000.0,1406844000.0


In [74]:
# Exploring how to convert the London label timestamps into something with which we can easily
# Extract data from the time series

import time
labelsbyindex = np.zeros((y2.shape[0],3))
pattern = '%d/%m/%Y %H:%M:%S'
for j in range(y2.shape[0]):
    date_time = [str(y2['ddate1'][j]).strip() + " " + str(y2['stime'][j]).strip(),
        str(y2['ddate1'][j]).strip() + " " + str(y2['etime'][j]).strip()]
    labelsbyindex[j,0:2] = np.array([int(time.mktime(time.strptime(date_time[i], pattern))) for i in range(2)])
    labelsbyindex[j,2]  = (labelsbyindex[j,1] - labelsbyindex[j,0]) / 3600

y2['stime_numeric'] = labelsbyindex[:,0]
y2['etime_numeric'] = labelsbyindex[:,1]
y2['difftime_numeric'] = labelsbyindex[:,0]
print(date_time)
print(epoch)
print(labelsbyindex)

['01/08/2014 00:00:00', '01/08/2014 04:00:00']
[1406858400, 1406887200]
[[  1.40685840e+09   1.40688720e+09   8.00000000e+00]
 [  1.40688720e+09   1.40691240e+09   7.00000000e+00]
 [  1.40691240e+09   1.40684400e+09  -1.90000000e+01]
 [  1.40684400e+09   1.40685840e+09   4.00000000e+00]]


In [66]:
y2['stime_numeric']

0    1.406858e+09
1    1.406887e+09
2    1.406912e+09
3    1.406844e+09
Name: stime_numeric, dtype: float64

In [None]:
# next step for London labels:
# slide the time series using the labels
# generate y and X such that first dimension of X equals y

In [1]:
type(y)

NameError: name 'y' is not defined