In [1]:
# Objective:
# User-friendly function to load data and parameters and feed them into the training/application
# In this notebook I am preparing the code snippets that will make up such a function

import numpy as np
import pandas as pd
from os import listdir
from numpy import genfromtxt

In [36]:
# Short clarification on terminology
# The term measurement refers to one entity of investigation, e.g. a patient or an animal,
# within which we want to classify something, e.g. behaviour, disease, ...
# The term dataset refers to a set of entities between which one may want to study differences in classification
#============================================================================
# Input:

studyname = 'UCR' # to simplify switching between datasets when testing
# Minimum amount of user input needed to interpret the data:
#   datadir  - directory that holds the data file(s)
#   multivar - are these multivariate time series? True or False
#   labloc   - location of the classification labels: first column (0), first row (1),
#              or the path of a separate label file (string)
#   idbloc   - location of the id: first column (0), first row (1), separate file (2),
#              not applicable (3)
#   labint   - is the label an integer? True or False
#   timecol  - time series per column (True) or per row (False)
if studyname == 'Utrecht':
    datadir = "/home/vincent/estep/data/utrecht"
    multivar = True
    labloc = "/home/vincent/estep/data/utrecht_labels.csv"
    idbloc = 2
    labint = True
    timecol = False
elif studyname == 'UCR':
    datadir = "/home/vincent/estep/data/UCR_TS_Archive_2015/50words"
    multivar = False
    labloc = 0
    idbloc = 3
    labint = True
    timecol = False
else:
    # Fail early: without this branch none of the settings above would be defined and the
    # first use of them would fail with a NameError that is harder to trace back.
    raise ValueError("Unknown studyname %r: expected 'Utrecht' or 'UCR'" % (studyname,))
#============================================================================

In [37]:
# List the contents of datadir and flag, per entry, whether its name ends in .txt or .csv
# NOTE(review): listdir returns the entries in arbitrary order, so any positional access
# into `filenames` is not guaranteed to pick the same file on every machine - confirm.
filenames = listdir(datadir)
Nfiles = len(filenames) # number of entries found in datadir
# One boolean list per extension, each aligned with `filenames`
TXTFILES, CSVFILES = (
    [name.endswith(extension) for name in filenames]
    for extension in ('.txt', '.csv')
)

In [38]:
# Investigate what format one data file has by trying out a variety of reading attempts.
# Every delimiter/skiprows combination is passed to np.loadtxt; a combination that cannot be
# read into a 2-D numeric array is recorded with nrow = ncol = first cell = 0.
# NOTE(review): index 1 picks the second entry of `filenames` (and requires at least two
# entries in datadir) - confirm whether the first file, filenames[0], was intended.
path = datadir + '/' + filenames[1]
delimiter = [None, ','] # possible delimiter values
skiprows = [0, 1] # possible numbers of leading rows to skip
ntests = len(delimiter) * len(skiprows)
attempts = [] # one record per (delimiter, skiprows) combination, in the order tried
for di in delimiter:
    for si in skiprows:
        try:
            F1 = np.loadtxt(fname=path, delimiter=di, skiprows=si)
            nrow, ncol = F1.shape[0], F1.shape[1]
            firstcell = F1[0, 1] # value at row 0, column 1
        except (ValueError, IndexError):
            # ValueError: the text could not be converted to numbers with these settings.
            # IndexError: the result is not 2-D or has fewer than two columns.
            # Other errors (e.g. a missing file) are deliberately not swallowed.
            nrow, ncol, firstcell = 0, 0, 0
        attempts.append({'delimiter': di, 'skiprows': si,
                         'nrow': nrow, 'ncol': ncol, 'first cell': firstcell})
# Build the table in one go instead of filling it cell by cell through chained indexing
# (df['col'][i] = value), which pandas does not guarantee to write back to df.
df = pd.DataFrame(attempts, columns=['delimiter', 'skiprows', 'nrow', 'ncol', 'first cell'])
# df is now a dataframe with information to help identify how the data should be loaded
print(df)

  delimiter skiprows nrow ncol first cell
0      None        0    0    0          0
1      None        1    0    0          0
2         ,        0  455  271   -0.89094
3         ,        1  454  271   -0.78346


In [39]:
# Load one file based on the extracted information on fileformat.
# The reading attempt that produced the largest number of rows is considered the best;
# ties are broken by the largest number of columns among those tied attempts.
form = df[df.nrow == df.nrow.max()]
if form.shape[0] > 1:
    # Filter within the tied candidates (not the full table) so the chosen attempt
    # still has the maximum number of rows.
    form = form[form.ncol == form.ncol.max()]
# If several attempts are still tied, use the one that was tried first. Selecting a single
# row also avoids truth-testing / int-casting a Series, which fails for more than one row.
best = form.iloc[0]
best_delimiter = ',' if best['delimiter'] == ',' else None # None = split on whitespace
F2 = np.loadtxt(fname=path, delimiter=best_delimiter, skiprows=int(best['skiprows']))

In [40]:
# Peek at the top-left 3x3 corner of the loaded matrix
F2[:3, :3]

array([[  4.     ,  -0.89094,  -0.86099],
       [ 12.     ,  -0.78346,  -0.68562],
       [ 13.     ,  -1.3256 ,  -1.2843 ]])

In [41]:
# Extract labels y and data X, and standardize shape of matrix
# The label dtype is derived here from labint so that this step does not depend on a
# variable that is only assigned further down in the notebook.
labtype = 'int' if labint == True else 'str'
if isinstance(labloc, str):
    # labloc is the path of a separate label file; its first line is skipped as header
    y = genfromtxt(labloc, delimiter=',', skip_header=1) # do we want numpy array or pd dataframe?
    #y=pd.read_csv(labloc, sep=',',header=None)
    # TO DO: y needs to be converted in simple one dimensional array that aligns with X
    # - how can we verify which time series links to which label...both should have a unique key
    # - in case of utrecht, the key is in the id in the filename, which is also in y
    # - is this generic for all separate files?
    # - no, if there is one class per id then we know that it is file level
    # - if there are multiple classes per id (e.g. London data) then we know that it is both between and
    # within files
    if timecol == False:
        X = F2.transpose()
    else:
        X = F2
elif isinstance(labloc, int) and not isinstance(labloc, bool):
    if labloc == 0:
        # labels in the first column, remaining columns are data
        y = np.array(F2[:, 0], dtype=labtype)
        X = F2[:, 1:]
    elif labloc == 1:
        # labels in the first row, remaining rows are data (transposed)
        y = np.array(F2[0, :], dtype=labtype)
        X = F2[1:, :].transpose()
    else:
        # Previously X and y were silently left undefined for any other value
        raise ValueError("Unsupported labloc %r: expected 0 (first column), 1 (first row) "
                         "or the path of a label file" % (labloc,))
else:
    raise TypeError("labloc must be an int (0 or 1) or a str (path of a label file), "
                    "got %r" % (labloc,))

In [42]:
# Pick the dtype name used for the labels: 'int' when labint is True, 'str' otherwise
labtype = 'int' if labint == True else 'str'

In [43]:
# Shape of the loaded matrix after transposing
F2.T.shape

(271, 455)

In [44]:
# Display the shape of the extracted label array y
y.shape

(455,)