In [1]:
import pandas as pd 
import numpy as np
import os
import matplotlib.pyplot as plt

In [None]:
def plot_channels(raw_df: pd.DataFrame, column_data_str: list, labels):
    """
    Plot every requested column in its own subplot and shade the labelled rows.

    Args:
        raw_df: frame holding one column per channel; its row count defines the x axis.
        column_data_str: names of the columns to plot, one subplot per name.
        labels: one value per row of raw_df (array, list or Series); rows where
            the value is > 0 are shaded in red on every subplot.
    """
    n_channels = len(column_data_str)
    # squeeze=False keeps `ax` two-dimensional even when a single channel is
    # requested; without it plt.subplots returns a bare Axes that cannot be indexed.
    f, ax = plt.subplots(n_channels, 1, figsize=(30, 3 * n_channels), squeeze=False)
    f.tight_layout(pad=2)
    timeseries_len = raw_df.shape[0]
    x_positions = range(timeseries_len)
    # Convert once so plain Python lists work with the `> 0` comparison too.
    shaded_rows = np.asarray(labels) > 0
    height_line = 1
    for index, column_name in enumerate(column_data_str):
        channel_ax = ax[index, 0]
        channel_ax.plot(x_positions, raw_df[column_name])
        channel_ax.set_title(f"channel: {column_name}", fontsize=20)

        # The x-axis transform keeps x in data units and y in axes fraction,
        # so 0..height_line spans the full height of the subplot.
        channel_ax.fill_between(x_positions, 0, height_line, where=shaded_rows, color='red', alpha=0.2, transform=channel_ax.get_xaxis_transform())
    plt.show()

# HASC-2011 dataset preprocess

In [8]:
# Paths of one HASC recording: the accelerometer samples and the matching label file.
raw_file = './data/hasc-111018-165936-acc.csv'
raw_label = './data/hasc-111018-165936-acc.label'

# The acceleration CSV is read without a header row; column names are assigned here.
raw_df = pd.read_csv(raw_file, header=None)
raw_df.columns = ['time', 'x', 'y', 'z']
# The first line of the label file is skipped (presumably a header/comment line -- confirm).
raw_label_df = pd.read_csv(raw_label, skiprows=1, header=None)
raw_label_df.columns = ['start', 'end', 'label']
# Quick sanity check of both frames.
print(f'raw shape: {raw_df.shape}, label shape: {raw_label_df.shape}')
print('Raw activity data')
print(raw_df.head())
print('\nraw label data')
print(raw_label_df.head())

raw shape: (39397, 4), label shape: (38, 3)
Raw activity data
          time         x         y         z
0  5015.672119  0.115128 -0.988739 -0.090057
1  5015.687371  0.110138 -0.986694 -0.086288
2  5015.704061  0.116180 -0.991669 -0.091003
3  5015.715389  0.112259 -0.989670 -0.091095
4  5015.726564  0.114349 -0.989746 -0.097855

raw label data
      start       end                    label
0  5071.934       NaN       move;escalator;B2F
1  5098.502  5126.499         escalatorUp;stay
2  5126.970       NaN           move;floor;B1F
3  5127.665  5143.411  walk;floor;B1F;steps;22
4  5147.988       NaN       move;escalator;B1F


In [3]:
# add index column
raw_df = raw_df.reset_index()
print(raw_df.head())

   index         time         x         y         z
0      0  5015.672119  0.115128 -0.988739 -0.090057
1      1  5015.687371  0.110138 -0.986694 -0.086288
2      2  5015.704061  0.116180 -0.991669 -0.091003
3      3  5015.715389  0.112259 -0.989670 -0.091095
4      4  5015.726564  0.114349 -0.989746 -0.097855


### L2 norm

In [4]:
# Per-sample Euclidean norm of the (x, y, z) acceleration vector:
# square each component, sum across the three columns, take the square root.
raw_df['l2_norm'] = np.sqrt(np.square(raw_df[['x', 'y', 'z']]).sum(axis=1))
raw_df['l2_norm'].head()

0    0.999485
1    0.996565
2    1.002590
3    1.000174
4    1.001124
Name: l2_norm, dtype: float64

In [5]:
# Make sure the output folder exists so the export also works on a fresh checkout.
os.makedirs('./data/preprocess', exist_ok=True)
# Only the l2_norm column is exported; the row index is written as the first CSV column.
raw_df[['l2_norm']].to_csv('./data/preprocess/hasc_l2_norm.csv')

## create breakpoints index label

In [6]:
# Plain NumPy views of the sample timestamps and the label start times,
# used by the breakpoint search in the next cell.
raw_time = raw_df['time'].to_numpy()
raw_label_start = raw_label_df['start'].to_numpy()

In [7]:
# Convert label start times into sample indices with a single pass over the samples.
# `start_index` points at the next label start that still has to be matched; a label
# is matched by the first remaining sample whose timestamp is strictly greater.
# NOTE(review): at most one label is matched per sample, so if two label starts fall
# between two consecutive samples the second one is assigned to the following sample.
# NOTE(review): the recorded output lists 40 indices while the label frame printed by
# the loading cell has 38 rows, so the saved outputs seem to come from different
# executions -- re-run from a fresh kernel to confirm.
breakpoints_index = []
start_index = 0
for i in range(len(raw_time)):
    if start_index < len(raw_label_start) and raw_time[i] > raw_label_start[start_index]:
        breakpoints_index.append(i)
        start_index += 1
print(f'len: {len(breakpoints_index)}, value: {breakpoints_index}')

len: 40, value: [3959, 5305, 7812, 10325, 10565, 13084, 14136, 14184, 14411, 14931, 18428, 19901, 21086, 21538, 23834, 24236, 24469, 24818, 24884, 25214, 25597, 25710, 25780, 26510, 27408, 27431, 28116, 28396, 29188, 30016, 30072, 30407, 30769, 30858, 33567, 34033, 34265, 35529, 37511, 37787]


In [16]:
# Save the breakpoint sample indices as integers (fmt='%i') to a text file.
np.savetxt('./data/preprocess/hasc_label_index.txt', breakpoints_index, fmt='%i')

# EEG dataset preprocess

In [3]:
# Display the absolute path that relative paths are resolved against.
# NOTE(review): the recorded output exposes a local user directory; clear it before sharing.
os.path.abspath('')

'c:\\Users\\Minh Nhat\\Downloads\\DCU\\Practicum\\TIRE-custom'

In [2]:
# os.path.dirname(__file__) does not work inside a notebook, so the absolute
# path returned by abspath('') is used as the base instead.
dirname = os.path.abspath('')
# Training folder of the grasp-and-lift EEG data, reached through '../Data' relative to dirname.
eeg_training_data_folder = os.path.join(dirname, '../Data/grasp-and-lift-eeg-detection/train/')
print(eeg_training_data_folder)

# File-name templates; the two placeholders are filled with (subject, series).
file_data_format = 'subj{}_series{}_data.csv'
file_events_format = 'subj{}_series{}_events.csv'
# Example values used only to preview the resulting file names below.
subject = 1
series = 2
print(f'file data format: {file_data_format.format(subject, series)}')
print(f'file events format: {file_events_format.format(subject, series)}')

c:\Users\Minh Nhat\Downloads\DCU\Practicum\TIRE-custom\../Data/grasp-and-lift-eeg-detection/train/
file data format: subj1_series2_data.csv
file events format: subj1_series2_events.csv


In [3]:
from typing import Iterable, List

def create_files_by_template(base_folder: str, template: str, subjects: Iterable[int], series: int) -> List[str]:
    """
    Build one file path per subject from a file-name template.

    Args:
        base_folder: folder that every generated file name is joined onto.
        template: format string with two positional placeholders, filled with
            (subject, series) in that order.
        subjects: subject identifiers to generate a path for, e.g. range(1, 9).
        series: series number shared by all generated paths.

    Returns:
        The generated paths, in the order the subjects were supplied.
    """
    return [os.path.join(base_folder, template.format(subject, series)) for subject in subjects]

def concat_files_by_row(files: List[str]):
    """
    Read every CSV in `files` and stack the frames row-wise into one DataFrame.

    The result is re-indexed 0..n-1 so that rows coming from different files
    do not share index labels.

    Args:
        files: paths of the CSV files, concatenated in the given order.

    Returns:
        A single DataFrame holding the rows of all files.

    Raises:
        ValueError: propagated from pd.concat when `files` is empty.
    """
    frames = [pd.read_csv(path) for path in files]
    return pd.concat(frames, ignore_index=True)

# print(f'test create_files_by_template')
# list_file_label = create_files_by_template(eeg_training_data_folder, file_events_format, subjects=range(1, 3), series=1)
# print(list_file_label)
# combine_label_df = concat_files_by_row(list_file_label)
# combine_label_df.shape

In [4]:
def create_dataset(series: int):
    """
    Build the training / validation / testing CSVs for one series.

    For every split, the per-subject data files and events files of the given
    series are concatenated row-wise and written to ./data/eeg_grasp_and_lift/
    as dataset{series}_{split}_data.csv and dataset{series}_{split}_label.csv.

    Subjects per split: training 1-8, validation 9-10, testing 11-12.

    Args:
        series: series number substituted into the file-name templates.
    """
    output_folder = './data/eeg_grasp_and_lift/'
    configs = [('training', range(1, 9)), ('validation', range(9, 11)), ('testing', range(11, 13))]
    # (output file suffix, input file-name template)
    sources = [('data', file_data_format), ('label', file_events_format)]

    # exist_ok replaces the separate exists() check and the race it leaves open.
    os.makedirs(output_folder, exist_ok=True)

    for split_name, subjects in configs:
        print(f'type: {split_name}')
        for suffix, template in sources:
            input_files = create_files_by_template(eeg_training_data_folder, template, subjects=subjects, series=series)
            combined_df = concat_files_by_row(input_files)
            combined_df.to_csv(f'{output_folder}dataset{series}_{split_name}_{suffix}.csv', index=False)

# test create dataset 1
# create_dataset(2)


In [5]:
# Build the split CSVs for series 3 through 8.
# NOTE(review): the loop variable reuses the name `series`, so it overwrites the
# `series = 2` assigned in the EEG configuration cell.
for series in range(3, 9):
    create_dataset(series)

type: training
type: validation
type: testing
type: training
type: validation
type: testing
type: training
type: validation
type: testing
type: training
type: validation
type: testing
type: training
type: validation
type: testing
type: training
type: validation
type: testing


# Generated dataset 

## Jump mean

In [6]:
import simulate 
import utils

# Seed the random generators once before any data is generated
# (presumably for reproducibility -- confirm what utils.setup_random_seed seeds).
utils.setup_random_seed()

For the formulation, see page 5 of the TIRE paper.
Target: examine under which data circumstances DMD should be chosen over L2.
- [Done] Case 1: For jumpmean 
    - 1 real jump mean
    - 2 noise channels (still use AutoRegressive but with constant mean and variance). 
- Case 2: For jumpmean
    - 1 real jump mean 
    - 2 noise channels, but use Gaussian noise 
- Case 3: Channels are nearly the same in distribution shape, but different in magnitude 
    - Case 3.1: 3 channels jump mean 
    - Case 3.2: 3 channels Gaussian 
- Case 4: Mean and variance changed 
    - 1 channel mean 
    - 1 channel variance 
    - 1 channel noise 

In [7]:
## case 1 
def generate_jumpingmean_ar_noise(nr_cp=49, delta_t_cp = 100, delta_t_cp_std = 10, window_size=20):
    """
    Generate a 3-channel series in which only channel 0 carries a jumping mean.

    Channel 0 is produced by simulate.ar2 with a piecewise-constant mean that
    steps up at every segment boundary. Channels 1 and 2 are produced by
    simulate.ar2 with a constant mean argument (-1 and 0) and act as noise channels.

    Args:
        nr_cp: number of constant-mean segments that are concatenated.
        delta_t_cp: nominal segment length in samples.
        delta_t_cp_std: spread of the segment length; its square root scales
            the standard-normal draw that is added to delta_t_cp.
        window_size: forwarded to utils.parameters_to_cps and used to pad the
            returned labels at both ends.

    Returns:
        (timeseries, labels): timeseries has shape (3, T), T being the total
        number of samples. labels is the output of utils.parameters_to_cps,
        zero-padded with window_size - 1 entries in front and window_size
        entries behind, with every positive entry set to 1 (expected to have
        length T -- confirm against utils.parameters_to_cps).
    """
    # Deterministic staircase of segment means: step n is n/16 above step n-1.
    mu = np.zeros((nr_cp,))
    for n in range(1, nr_cp):
        mu[n] = mu[n-1] + n / 16

    # Repeat each segment mean for a randomly jittered number of samples.
    mean_per_sample = []
    for segment_mean in mu:
        nr = int(delta_t_cp + np.random.randn() * np.sqrt(delta_t_cp_std))
        mean_per_sample.extend(segment_mean * np.ones((nr,)))

    # Column vector of shape (T, 1); this is the layout handed to utils.parameters_to_cps.
    parameters_jumpingmean = np.array([mean_per_sample]).T

    channels = 3
    ts_length = len(parameters_jumpingmean)
    all_timeseries = []
    for channel in range(channels):
        # The first two samples stay 0 and seed the recursion.
        timeseries = np.zeros((ts_length))
        for i in range(2, ts_length):
            if channel == 0:
                # Pass the scalar mean rather than the 1-element row so the
                # assignment below does not depend on NumPy's deprecated
                # size-1-array-to-scalar conversion (assuming simulate.ar2
                # returns a value shaped like its mean argument -- confirm).
                timeseries[i] = simulate.ar2(timeseries[i-1], timeseries[i-2], 0.6, -0.5, parameters_jumpingmean[i, 0], 1.5)
            elif channel == 1:
                timeseries[i] = simulate.ar2(timeseries[i-1], timeseries[i-2], 0.9, -0.7, -1, 5.5)
            else:
                timeseries[i] = simulate.ar2(timeseries[i-1], timeseries[i-2], 0.6, -0.4, 0, 3.5)

        all_timeseries.append(timeseries)

    breakpoints = utils.parameters_to_cps(parameters_jumpingmean, window_size) # len(breakpoints) = len(timeseries) - 2*window_size + 1
    full_breakpoints = np.concatenate([[0] * (window_size - 1), breakpoints, [0] * window_size])

    # Binarise: every positive entry becomes 1.
    full_breakpoints[full_breakpoints > 0] = 1

    return np.array(all_timeseries), full_breakpoints

### case 2 
def generate_jumpingmean_gaussian_noise(nr_cp=49, delta_t_cp = 100, delta_t_cp_std = 10, window_size=20):
    """
    Generate a 3-channel series: one jumping-mean channel plus two Gaussian noise channels.

    Channel 0 is produced by simulate.ar2 with a piecewise-constant mean that
    steps up at every segment boundary. Channels 1 and 2 are independent draws
    from np.random.normal(-1, 5.5) and np.random.normal(0, 3.5).

    Args:
        nr_cp: number of constant-mean segments that are concatenated.
        delta_t_cp: nominal segment length in samples.
        delta_t_cp_std: spread of the segment length; its square root scales
            the standard-normal draw that is added to delta_t_cp.
        window_size: forwarded to utils.parameters_to_cps and used to pad the
            returned labels at both ends.

    Returns:
        (timeseries, labels): timeseries has shape (3, T), T being the total
        number of samples. labels is the output of utils.parameters_to_cps,
        zero-padded with window_size - 1 entries in front and window_size
        entries behind, with every positive entry set to 1 (expected to have
        length T -- confirm against utils.parameters_to_cps).
    """
    # Deterministic staircase of segment means: step n is n/16 above step n-1.
    mu = np.zeros((nr_cp,))
    for n in range(1, nr_cp):
        mu[n] = mu[n-1] + n / 16

    # Repeat each segment mean for a randomly jittered number of samples.
    mean_per_sample = []
    for segment_mean in mu:
        nr = int(delta_t_cp + np.random.randn() * np.sqrt(delta_t_cp_std))
        mean_per_sample.extend(segment_mean * np.ones((nr,)))

    # Column vector of shape (T, 1); this is the layout handed to utils.parameters_to_cps.
    parameters_jumpingmean = np.array([mean_per_sample]).T

    channels = 3
    ts_length = len(parameters_jumpingmean)
    all_timeseries = []
    for channel in range(channels):
        # The first two samples of every channel stay 0.
        timeseries = np.zeros((ts_length))
        for i in range(2, ts_length):
            if channel == 0:
                # Pass the scalar mean rather than the 1-element row so the
                # assignment below does not depend on NumPy's deprecated
                # size-1-array-to-scalar conversion (assuming simulate.ar2
                # returns a value shaped like its mean argument -- confirm).
                timeseries[i] = simulate.ar2(timeseries[i-1], timeseries[i-2], 0.6, -0.5, parameters_jumpingmean[i, 0], 1.5)
            elif channel == 1:
                timeseries[i] = np.random.normal(-1, 5.5)
            else:
                timeseries[i] = np.random.normal(0, 3.5)

        all_timeseries.append(timeseries)

    breakpoints = utils.parameters_to_cps(parameters_jumpingmean, window_size) # len(breakpoints) = len(timeseries) - 2*window_size + 1
    full_breakpoints = np.concatenate([[0] * (window_size - 1), breakpoints, [0] * window_size])

    # Binarise: every positive entry becomes 1.
    full_breakpoints[full_breakpoints > 0] = 1

    return np.array(all_timeseries), full_breakpoints



In [None]:
# Case 3: Channels are nearly the same in distribution shape, but different in magnitude 
# Case 3.1: 3 channels jump mean 
def generate_jumpingmean_diff_magnitude(nr_cp=49, delta_t_cp = 100, delta_t_cp_std = 10, window_size=20):
    """
    Generate a 3-channel series in which every channel follows the same jumping mean.

    All three channels are produced by simulate.ar2 with the same
    piecewise-constant mean; they differ only in the two coefficients and in
    the last argument (1.5, 5.5 and 3.5) passed to simulate.ar2.

    Args:
        nr_cp: number of constant-mean segments that are concatenated.
        delta_t_cp: nominal segment length in samples.
        delta_t_cp_std: spread of the segment length; its square root scales
            the standard-normal draw that is added to delta_t_cp.
        window_size: forwarded to utils.parameters_to_cps and used to pad the
            returned labels at both ends.

    Returns:
        (timeseries, labels): timeseries has shape (3, T), T being the total
        number of samples. labels is the output of utils.parameters_to_cps,
        zero-padded with window_size - 1 entries in front and window_size
        entries behind, with every positive entry set to 1 (expected to have
        length T -- confirm against utils.parameters_to_cps).
    """
    # Deterministic staircase of segment means: step n is n/16 above step n-1.
    mu = np.zeros((nr_cp,))
    for n in range(1, nr_cp):
        mu[n] = mu[n-1] + n / 16

    # Repeat each segment mean for a randomly jittered number of samples.
    mean_per_sample = []
    for segment_mean in mu:
        nr = int(delta_t_cp + np.random.randn() * np.sqrt(delta_t_cp_std))
        mean_per_sample.extend(segment_mean * np.ones((nr,)))

    # Column vector of shape (T, 1); this is the layout handed to utils.parameters_to_cps.
    parameters_jumpingmean = np.array([mean_per_sample]).T

    channels = 3
    ts_length = len(parameters_jumpingmean)
    all_timeseries = []
    for channel in range(channels):
        # The first two samples stay 0 and seed the recursion.
        timeseries = np.zeros((ts_length))
        for i in range(2, ts_length):
            # Use the scalar mean rather than the 1-element row so the
            # assignments below do not depend on NumPy's deprecated
            # size-1-array-to-scalar conversion (assuming simulate.ar2
            # returns a value shaped like its mean argument -- confirm).
            current_mean = parameters_jumpingmean[i, 0]
            if channel == 0:
                timeseries[i] = simulate.ar2(timeseries[i-1], timeseries[i-2], 0.6, -0.5, current_mean, 1.5)
            elif channel == 1:
                timeseries[i] = simulate.ar2(timeseries[i-1], timeseries[i-2], 0.9, -0.7, current_mean, 5.5)
            else:
                timeseries[i] = simulate.ar2(timeseries[i-1], timeseries[i-2], 0.8, -0.1, current_mean, 3.5)

        all_timeseries.append(timeseries)

    breakpoints = utils.parameters_to_cps(parameters_jumpingmean, window_size) # len(breakpoints) = len(timeseries) - 2*window_size + 1
    full_breakpoints = np.concatenate([[0] * (window_size - 1), breakpoints, [0] * window_size])

    # Binarise: every positive entry becomes 1.
    full_breakpoints[full_breakpoints > 0] = 1

    return np.array(all_timeseries), full_breakpoints

In [18]:
def saving_single_generate_dataset(timeseries, breakpoints, dataset_number: int, datatype: str, folder_prefix: str):
    """
    Write one generated dataset (series and labels) as two CSV files.

    Both files go to ./data-gen/{folder_prefix}/ and are named
    {folder_prefix}-dataset{dataset_number}-{datatype}-data.csv and
    {folder_prefix}-dataset{dataset_number}-{datatype}-label.csv.
    Columns are named col_0, col_1, ... and no index column is written.

    Args:
        timeseries: array whose transpose is stored, one column per row of the input.
        breakpoints: label values stored in the label file.
        dataset_number: number embedded in the file names.
        datatype: split name embedded in the file names.
        folder_prefix: sub-folder of ./data-gen and prefix of both file names.
    """
    target_folder = f'./data-gen/{folder_prefix}'
    utils.create_folder_if_not_exist(target_folder)
    file_stem = f'{target_folder}/{folder_prefix}-dataset{dataset_number}-{datatype}'

    # The series is written first, then the labels; both share the same layout rules.
    for payload, suffix in ((timeseries.T, 'data'), (breakpoints, 'label')):
        frame = pd.DataFrame(payload)
        frame.columns = [f'col_{col}' for col in frame.columns]
        frame.to_csv(f'{file_stem}-{suffix}.csv', index=False)

'''
generated_type:
case 1: jumpmean_ar
case 2: jumpmean_gauss
case 3.1: jumpmean_same_dis
case 3.2: gauss_same_dis
case 4: jumpmean_scalevariance
'''
def saving_generate_datasets(generated_type: str):
    """
    Generate and save ten datasets, each with a training, validation and testing part.

    Only 'jumpmean_gauss' / 'case2' is handled at the moment; any other value
    prints a hint and returns without generating anything.

    Args:
        generated_type: name of the generation scenario.
    """
    # Case 1 ('jumpmean_ar' / 'case1' -> generate_jumpingmean_ar_noise, folder
    # 'jumpmean_ar') is currently switched off.
    if generated_type not in ('jumpmean_gauss', 'case2'):
        print('Please check generated_type parameter')
        return

    generate_dataset_f = generate_jumpingmean_gaussian_noise
    folder_prefix = 'jumpmean-gauss' # for folder convention

    # (split name, number of segments) for every part of a dataset.
    splits = [('training', 49), ('validation', 49), ('testing', 49)]
    for dataset_number in range(1, 11):
        for datatype, segment_count in splits:
            timeseries, breakpoints = generate_dataset_f(nr_cp=segment_count, window_size=20) # 20 for jump mean
            saving_single_generate_dataset(timeseries, breakpoints, dataset_number, datatype, folder_prefix)

In [19]:
# Generate and save the case-2 (Gaussian noise) datasets under ./data-gen/jumpmean-gauss/.
saving_generate_datasets('jumpmean_gauss')

In [20]:
# Example of the file-name pattern used for the generated datasets.
# NOTE(review): this path starts at '../data-gen' while saving_single_generate_dataset
# writes under './data-gen' -- confirm which working directory is intended.
folder_prefix = '../data-gen/jumpmean-gauss/jumpmean-gauss'
dataset_number = 1 
f'{folder_prefix}-dataset{dataset_number}-training-data.csv'

'../data-gen/jumpmean-gauss/jumpmean-gauss-dataset1-training-data.csv'