In [1]:
# Objective:
# User-friendly function to load data and parameters and feed it into the training/application
# In this notebook I am preparing the code snippets to make up such a function

import numpy as np
import pandas as pd
from os import listdir

In [2]:
# The term "measurement" refers to one entity of investigation within which we want to classify something,
# e.g. in a patient/animal/... we want to classify behaviour, disease, ...
# The term "dataset" refers to a set of entities between which we may want to study differences in classification
#============================================================================
# Input:

# Name of the study to load; selects one of the configuration presets below.
studyname = 'Utrecht'

# Minimum amount of user input needed to interpret the data:
#   datadir  - directory that holds the data files
#   multivar - True if the data are multivariate time series
#   labloc   - where the classification labels are stored:
#              first column (0), first row (1), separate file (2)
#   idbloc   - where the ids are stored:
#              first column (0), first row (1), separate file (2), not applicable (3)
#   labint   - True if the labels are integers
#   timecol  - True if the time series are stored in columns
if studyname == 'Utrecht':
    datadir = "/home/vincent/estep/data/utrecht"
    multivar = True
    labloc = 2
    idbloc = 2
    labint = True
    timecol = False
elif studyname == 'UCR':
    datadir = "/home/vincent/estep/data/UCR_TS_Archive_2015/50words"
    multivar = False
    labloc = 0
    idbloc = 3
    labint = True
    timecol = False
else:
    # Fail right here instead of with a NameError in a later cell.
    raise ValueError("Unknown studyname %r; expected 'Utrecht' or 'UCR'" % (studyname,))
#============================================================================

In [15]:
# Identify the number of files and their file types.
# os.listdir returns the names in arbitrary order, so they are sorted to make
# any index-based file selection reproducible across machines and runs.
filenames = sorted(listdir(datadir))
Nfiles = len(filenames)  # number of entries in the data directory
TXTFILES = [name.endswith('.txt') for name in filenames]  # True where the entry is a .txt file
CSVFILES = [name.endswith('.csv') for name in filenames]  # True where the entry is a .csv file

In [16]:
# Investigate the format of one data file by trying to read it with every
# combination of candidate delimiter and number of skipped leading rows.
# NOTE(review): index 1 selects the second entry of `filenames`, although the
# original comment spoke of the "first file" - confirm which one is intended.
path = datadir + '/' + filenames[1]
delimeter = [None, ',']  # None: np.loadtxt's default, i.e. split on whitespace
skiprows = [0, 1]
ntests = len(delimeter) * len(skiprows)

# Collect one record per attempt and build the table in one go. Assigning
# through chained indexing (df[col][row] = value) is not reliable in pandas.
attempts = []
for di in delimeter:
    for si in skiprows:
        try:
            F1 = np.loadtxt(fname=path, delimiter=di, skiprows=si)
            nrow, ncol = F1.shape[0], F1.shape[1]
            first_cell = F1[0, 1]  # value at row 0, column 1
        except (ValueError, IndexError):
            # ValueError: the file cannot be parsed with this combination.
            # IndexError: the result is not a 2-D array with at least two columns.
            nrow, ncol, first_cell = 0, 0, 0
        attempts.append({'delimiter': di, 'skiprows': si,
                         'nrow': nrow, 'ncol': ncol, 'first cell': first_cell})

# df is now a dataframe with information to help identify how the data should be loaded
df = pd.DataFrame(attempts, columns=['delimiter', 'skiprows', 'nrow', 'ncol', 'first cell'])
print(df)

  delimiter skiprows  nrow ncol first cell
0      None        0  4667   21   -0.78125
1      None        1  4666   21    -5.7617
2         ,        0     0    0          0
3         ,        1     0    0          0


In [18]:
# Load one file based on the extracted information on the file format.
# The reading attempt that produced the most rows is considered the best;
# ties are broken by the number of columns and then by the order of the attempts.
if df.empty or df.nrow.max() == 0:
    raise ValueError("None of the reading attempts could parse " + path)

form = df[df.nrow == df.nrow.max()]
if form.shape[0] > 1:
    # Break the tie among the tied candidates only, not across the whole table.
    form = form[form.ncol == form.ncol.max()]
form = form.iloc[[0]]  # keep `form` a one-row DataFrame

best = form.iloc[0]
# Anything other than a comma falls back to np.loadtxt's default delimiter.
best_delimiter = ',' if best['delimiter'] == ',' else None
F2 = np.loadtxt(fname=path, delimiter=best_delimiter, skiprows=int(best['skiprows']))

In [19]:
# Peek at the top-left 6x6 corner of the loaded matrix
F2[:6, :6]

array([[  2.9297  ,  -0.78125 ,  -2.9297  ,   6.1523  ,   2.9297  ,  -3.418   ],
       [  3.7109  ,  -5.7617  ,  -4.6875  ,   5.6641  ,   3.2227  ,
         -4.9805  ],
       [ -0.39062 ,  -8.3008  ,  -5.6641  ,   4.6875  ,   3.6133  ,
         -6.0547  ],
       [ -7.2266  ,  -7.6172  ,  -6.4453  ,   0.48828 ,   3.7109  ,
         -8.7891  ],
       [ -8.3984  ,  -4.9805  ,  -7.0312  ,  -3.7109  ,   3.7109  , -10.352   ],
       [ -8.5937  ,  -0.097656,  -7.5195  ,  -4.8828  ,   4.1992  ,
         -8.0078  ]])

In [22]:
# Extract labels y and data X from the loaded matrix F2, based on the
# user-supplied description of the file layout (labint, labloc, timecol).
labtype = 'int' if labint else 'str'

if labloc == 0:
    # Labels are in the first column; the remaining columns are the data.
    # The label column is excluded from X so the labels do not leak into it.
    y = np.array(F2[:, 0], dtype=labtype)
    X = F2[:, 1:]
elif labloc == 1:
    # Labels are in the first row; the remaining rows are the data,
    # transposed as in the original code.
    y = np.array(F2[0, :], dtype=labtype)
    X = F2[1:, :].transpose()
elif labloc == 2:
    y = 0  # placeholder - TODO: load the file in which the labels are stored
    # F2 holds data only; transpose it unless the time series are in columns.
    X = F2 if timecol else F2.transpose()
else:
    # Fail right here instead of leaving X and y undefined.
    raise ValueError("Unsupported labloc %r; expected 0, 1 or 2" % (labloc,))


In [23]:
# Shape of the loaded matrix after swapping its axes
F2.T.shape

(21, 4667)