In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import time

# Download the dataset from the link below and extract it into the
# directory that `dir` points to in the loading cell ("raw_data"):
# https://archive.ics.uci.edu/ml/datasets/mhealth+dataset

### Load all .log files

In [2]:
# Directory that holds the extracted raw recordings and the extension of the
# files to load from it.
dir = Path("raw_data")
data_ext=".log"

t1s = time.time()

# Find every file with the requested extension, searching sub-directories too.
# The paths are sorted because Path.glob yields them in an unspecified order;
# sorting keeps the row order of the stacked array (and therefore the windows
# built from it later) reproducible between runs and machines.
log_files = sorted(dir.glob('**/*'+data_ext))
if not log_files:
    # Fail early with a clear message instead of a TypeError on `None` below.
    raise FileNotFoundError(
        "No '*{ext}' files found under '{root}'".format(ext=data_ext, root=dir)
    )

# Read each tab-separated, header-less file and collect the raw arrays.
# They are stacked once at the end: calling np.vstack inside the loop would
# copy the whole accumulated array again for every file.
per_file_arrays = []
for file in log_files:
    per_file_arrays.append(pd.read_csv(file, delimiter='\t', header=None).values)
li = np.vstack(per_file_arrays)

# separate data from labels: column 23 is the label, columns 0..22 the features
labels = li[:, 23].astype(int)

# eliminate null data (rows whose label is 0)
keep_rows = labels != 0
data = li[keep_rows, :23]
labels = labels[keep_rows]

t1e = time.time()

### Bagging strategy (sliding windows over the samples)

In [5]:

# Define the window size and the stride
# Window size: number of consecutive samples flattened into one feature row
WINDOW_SIZE = 10
# Stride: number of samples between the starts of two consecutive windows
# (with STRIDE > WINDOW_SIZE the samples between two windows are not used)
STRIDE = 100
# Number of label classes; labels are expected to lie in 1..N_CLASSES because
# rows with the null label 0 are removed when the data is loaded
N_CLASSES = 12

t2s = time.time()
spinning_cursor = '|/-\\'
k=0  # not used in this cell; kept in case another cell reads it

# A window starting at i covers rows i .. i+WINDOW_SIZE-1, so the last valid
# start is data.shape[0]-WINDOW_SIZE *inclusive*; the "+ 1" makes range()
# include it. A non-positive value simply yields an empty loop.
n_window_starts = data.shape[0] - WINDOW_SIZE + 1

# Rows are collected in Python lists and stacked once after the loop;
# np.vstack inside the loop would re-copy the whole result on every iteration.
window_rows = []
window_classes = []

# iterate through windows
for i in range(0, n_window_starts, STRIDE):

    # logger: overwrite the same console line with the progress and a spinner
    # that advances once per elapsed second
    print("",end='\r')
    print(r'Working: {per:6.2f}%'.format(per=100*(i+1)/n_window_starts),r'{cur}'.format(cur=spinning_cursor[int(time.time()-t2s)%4]), end='')

    # Data stacking: flatten the (WINDOW_SIZE, n_features) slice into one row
    data_slice = data[i:i+WINDOW_SIZE].flatten()
    # Skip windows that contain any missing value
    if np.isnan(data_slice).any():
        continue
    # # Data normalisation (disabled)
    # data_slice = ((data_slice-data_slice.min(axis=0))/(data_slice.max(axis=0) - data_slice.min(axis=0))).flatten()
    # if np.isnan(data_slice).any():
    #     continue

    window_rows.append(data_slice)

    # Label stacking: majority vote over the labels inside the window.
    # Labels are shifted by one so that label 1 maps to index 0; the stored
    # class is therefore 0-based (label - 1). argmax returns the lowest index
    # on ties.
    label_counts = np.bincount(labels[i:i+WINDOW_SIZE] - 1, minlength=N_CLASSES)
    window_classes.append(label_counts.argmax())

if not window_rows:
    # Either the data is shorter than one window or every window had NaNs.
    raise ValueError(
        "No usable windows: data has {n} rows, WINDOW_SIZE={w}, STRIDE={s}".format(
            n=data.shape[0], w=WINDOW_SIZE, s=STRIDE
        )
    )

# One row per window; always 2-D, even when only a single window was kept.
windows = np.vstack(window_rows)
# Column vector of window classes so it can be appended as the last column.
window_labels = np.array(window_classes).reshape(-1, 1)

handled_data = np.hstack((windows,window_labels))
# Sanity check: NaN windows were skipped above, so none should remain.
if np.isnan(handled_data).any():
    print("Warning: handled_data still contains NaN values")
print("\n")
t2e = time.time()

Working:  99.98% /



### Save data

In [6]:
# Output directory for the processed data set.
dir = Path("data")
# np.savetxt does not create missing directories, so make sure it exists
# (no error if it is already there).
dir.mkdir(parents=True, exist_ok=True)
# Write one window per line: the flattened features followed by the class
# in the last column, comma-separated.
np.savetxt(dir/"data_all.csv", handled_data, delimiter=",")

In [None]:
# Display the dimensions of the array that was written to data_all.csv
handled_data.shape