In [1]:
import os
from typing import Iterable, List

import numpy as np
import pandas as pd

# HASC-2011 dataset preprocessing

In [8]:
# Input files: the accelerometer samples and the matching activity-label file.
raw_file = './data/hasc-111018-165936-acc.csv'
raw_label = './data/hasc-111018-165936-acc.label'

# The accelerometer CSV is read without a header row; its columns are named here.
raw_df = pd.read_csv(raw_file, header=None, names=['time', 'x', 'y', 'z'])
# The first line of the label file is skipped; the remaining rows are named start/end/label.
raw_label_df = pd.read_csv(raw_label, skiprows=1, header=None, names=['start', 'end', 'label'])

print(f'raw shape: {raw_df.shape}, label shape: {raw_label_df.shape}')
# Preview the first rows of each frame under its title.
for title, frame in (('Raw activity data', raw_df), ('\nraw label data', raw_label_df)):
    print(title)
    print(frame.head())

raw shape: (39397, 4), label shape: (38, 3)
Raw activity data
          time         x         y         z
0  5015.672119  0.115128 -0.988739 -0.090057
1  5015.687371  0.110138 -0.986694 -0.086288
2  5015.704061  0.116180 -0.991669 -0.091003
3  5015.715389  0.112259 -0.989670 -0.091095
4  5015.726564  0.114349 -0.989746 -0.097855

raw label data
      start       end                    label
0  5071.934       NaN       move;escalator;B2F
1  5098.502  5126.499         escalatorUp;stay
2  5126.970       NaN           move;floor;B1F
3  5127.665  5143.411  walk;floor;B1F;steps;22
4  5147.988       NaN       move;escalator;B1F


In [3]:
# add index column
# Guarded so the cell can be re-run safely: calling reset_index() again on a
# frame that already has an 'index' column would add a 'level_0' column, and
# a further call would raise because 'level_0' already exists.
if 'index' not in raw_df.columns:
    raw_df = raw_df.reset_index()
print(raw_df.head())

   index         time         x         y         z
0      0  5015.672119  0.115128 -0.988739 -0.090057
1      1  5015.687371  0.110138 -0.986694 -0.086288
2      2  5015.704061  0.116180 -0.991669 -0.091003
3      3  5015.715389  0.112259 -0.989670 -0.091095
4      4  5015.726564  0.114349 -0.989746 -0.097855


### L2 norm

In [4]:
# Euclidean norm of each (x, y, z) sample: square the axes, sum per row, take the root.
axes_squared = raw_df[['x', 'y', 'z']].pow(2)
raw_df['l2_norm'] = np.sqrt(axes_squared.sum(axis=1))
raw_df['l2_norm'].head()

0    0.999485
1    0.996565
2    1.002590
3    1.000174
4    1.001124
Name: l2_norm, dtype: float64

In [5]:
# Save the L2-norm column. The output folder is created first because
# to_csv does not create missing parent directories.
os.makedirs('./data/preprocess', exist_ok=True)
raw_df[['l2_norm']].to_csv('./data/preprocess/hasc_l2_norm.csv')

## Create breakpoint index labels

In [6]:
# Pull the sample timestamps and the label start times out as NumPy arrays
# so they can be compared element by element.
raw_time = raw_df['time'].to_numpy()
raw_label_start = raw_label_df['start'].to_numpy()

In [7]:
# Walk the samples once. Whenever a sample's timestamp is past the next
# pending label start, record that sample's position as a breakpoint and
# move on to the following label start (at most one label per sample).
breakpoints_index = []
start_index = 0  # position of the next label start still waiting for a match
for i, sample_time in enumerate(raw_time):
    if start_index >= len(raw_label_start):
        # every label start has already been matched
        continue
    if sample_time > raw_label_start[start_index]:
        breakpoints_index.append(i)
        start_index += 1
print(f'len: {len(breakpoints_index)}, value: {breakpoints_index}')

len: 40, value: [3959, 5305, 7812, 10325, 10565, 13084, 14136, 14184, 14411, 14931, 18428, 19901, 21086, 21538, 23834, 24236, 24469, 24818, 24884, 25214, 25597, 25710, 25780, 26510, 27408, 27431, 28116, 28396, 29188, 30016, 30072, 30407, 30769, 30858, 33567, 34033, 34265, 35529, 37511, 37787]


In [16]:
# save
# np.savetxt opens the target path directly, so make sure the folder exists first.
os.makedirs('./data/preprocess', exist_ok=True)
np.savetxt('./data/preprocess/hasc_label_index.txt', breakpoints_index, fmt='%i')

# EEG dataset preprocessing

In [3]:
# Display the absolute path that an empty relative path resolves to
# (the current working directory).
os.path.abspath('')

'c:\\Users\\Minh Nhat\\Downloads\\DCU\\Practicum\\TIRE-custom'

In [10]:
# os.path.dirname(__file__) does not work inside an .ipynb, so the absolute
# path of the working directory is used as the base folder instead.
dirname = os.path.abspath('')
# Training data folder, given relative to the base folder.
eeg_training_data_folder = os.path.join(dirname, '../Data/grasp-and-lift-eeg-detection/train/')
print(eeg_training_data_folder)

c:\Users\Minh Nhat\Downloads\DCU\Practicum\TIRE-custom\../Data/grasp-and-lift-eeg-detection/train/


In [6]:
# File-name templates; the two placeholders are filled with the subject
# number and then the series number.
file_data_format = 'subj{}_series{}_data.csv'
file_events_format = 'subj{}_series{}_events.csv'
# Sample values used only to preview the formatted file names below.
subject = 1
series = 2
print(f'file data format: {file_data_format.format(subject, series)}')
print(f'file events format: {file_events_format.format(subject, series)}')

file data format: subj1_series2_data.csv
file events format: subj1_series2_events.csv


In [24]:
# Load the events file of subject 1, series 1 and show its (rows, columns).
events_1_1_df = pd.read_csv(os.path.join(eeg_training_data_folder, file_events_format.format(1, 1)))
events_1_1_df.shape

(119496, 7)

In [25]:
# Load the events file of subject 2, series 1 and show its (rows, columns).
events_2_1_df = pd.read_csv(os.path.join(eeg_training_data_folder, file_events_format.format(2, 1)))
events_2_1_df.shape

(291874, 7)

In [26]:
# Combined row count of the two events frames loaded above.
events_1_1_df.shape[0] + events_2_1_df.shape[0]

411370

In [17]:
from typing import List

def create_files_by_template(base_folder: str, template: str, subjects: Iterable[int], series: int) -> List[str]:
    """Build the file path of one series for every subject.

    Args:
        base_folder: Folder that each generated file name is joined onto.
        template: Format string with two positional ``{}`` placeholders,
            filled with the subject and then the series.
        subjects: Iterable of subject identifiers; one path is produced per item.
        series: Series identifier shared by all generated paths.

    Returns:
        The generated paths, in the same order as ``subjects``.
    """
    return [os.path.join(base_folder, template.format(subject, series)) for subject in subjects]

def concat_files_by_row(files: List[str]):
    """Read every CSV listed in ``files`` and stack their rows into one DataFrame.

    The frames are concatenated in the given order with pandas' default
    settings, so each file's own row index is carried over unchanged.
    """
    return pd.concat([pd.read_csv(path) for path in files])

# Quick check: build the series-1 events paths for subjects 1 and 2, then combine the files.
print('test create_files_by_template')
list_file_label = create_files_by_template(
    eeg_training_data_folder,
    file_events_format,
    subjects=range(1, 3),
    series=1,
)
print(list_file_label)
combine_label_df = concat_files_by_row(list_file_label)
combine_label_df.shape

test create_files_by_template
['c:\\Users\\Minh Nhat\\Downloads\\DCU\\Practicum\\TIRE-custom\\../Data/grasp-and-lift-eeg-detection/train/subj1_series1_events.csv', 'c:\\Users\\Minh Nhat\\Downloads\\DCU\\Practicum\\TIRE-custom\\../Data/grasp-and-lift-eeg-detection/train/subj2_series1_events.csv']


(411370, 7)

In [29]:
def create_dataset(series: int, output_dir: str = './data/eeg_grasp_and_lift/'):
    """Write the combined training/validation/testing CSV files for one series.

    Subjects are split as training: 1-8, validation: 9-10, testing: 11-12.
    For every split, the per-subject data files and events files of the given
    series are concatenated row-wise and written without the index column as
    ``dataset{series}_{split}_data.csv`` and ``dataset{series}_{split}_label.csv``.

    Args:
        series: Series number used to select the input files and name the outputs.
        output_dir: Folder that receives the CSV files; created when missing.
    """
    splits = [('training', range(1, 9)), ('validation', range(9, 11)), ('testing', range(11, 13))]
    # (input file-name template, suffix used in the output file name)
    sources = [(file_data_format, 'data'), (file_events_format, 'label')]

    # exist_ok replaces the separate exists() check and tolerates the folder
    # being created between the check and the call.
    os.makedirs(output_dir, exist_ok=True)

    for split_name, subjects in splits:
        print(f'type: {split_name}')
        for template, suffix in sources:
            input_files = create_files_by_template(eeg_training_data_folder, template, subjects=subjects, series=series)
            combined_df = concat_files_by_row(input_files)
            combined_df.to_csv(os.path.join(output_dir, f'dataset{series}_{split_name}_{suffix}.csv'), index=False)

# Try the function on series 1 first.
create_dataset(series=1)


type: training
type: validation
type: testing


In [30]:
# Build the remaining datasets (series 2 through 8).
# A dedicated loop name is used so the notebook-level `series` setting
# is not overwritten by the loop.
for series_id in range(2, 9):
    create_dataset(series_id)

type: training
type: validation
type: testing
type: training
type: validation
type: testing
type: training
type: validation
type: testing
type: training
type: validation
type: testing
type: training
type: validation
type: testing
type: training
type: validation
type: testing
type: training
type: validation
type: testing
