In [1]:
# Preprocess the WISDM dataset.
# Download the dataset from: http://www.cis.fordham.edu/wisdm/dataset.php
# and unzip the archive into data_dir (defined below).


import os
import numpy as np
import pickle
import pandas as pd
from sklearn.model_selection import train_test_split

# Directory of the unzipped archive and the raw log file that lives inside it.
data_dir = './wisdm/WISDM_ar_v1.1'
raw_file_name = 'WISDM_ar_v1.1_raw.txt'
data_path = os.path.join(data_dir, raw_file_name)

In [3]:
# Read the whole raw file into memory, then break it into one string per line.
with open(data_path, 'r') as raw_file:
    raw_data = raw_file.read()
lines = raw_data.split('\n')

In [4]:
# Activity names; the position of a name in this list is its class index.
class_labels = ["Walking", "Jogging", "Sitting", "Standing", "Upstairs", "Downstairs"]

# Parallel per-record columns, filled in file order.
users, labels, time_stamps, datas = [], [], [], []

for raw_line in lines:
    fields = raw_line.strip().replace(';', '').split(',')
    # Keep only lines that split into six or seven comma-separated fields.
    if len(fields) not in (6, 7):
        continue
    user_id, activity, stamp, axis_a, axis_b, axis_c = fields[:6]
    # Drop records in which any of the three measurement fields is empty.
    if '' in (axis_a, axis_b, axis_c):
        continue

    users.append(int(user_id))
    datas.append([float(axis_a), float(axis_b), float(axis_c)])
    time_stamps.append(int(stamp))
    labels.append(class_labels.index(activity))  # name --> class index

In [5]:
# Gather the parsed columns into one frame, one row per record.
data_df = pd.DataFrame({
    'user': users,
    'data': datas,
    'label': labels,
    'time_stamp': time_stamps,
})

In [None]:
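# Save the dataframe (optional). Uncomment to cache it on disk; make_data()
# loads './wisdm/data_df.pkl' when it is called without data_df.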
# data_df.to_pickle('./wisdm/data_df.pkl')

In [6]:
def make_data(window_size=128, step=0, save_path='./wisdm/dataset', data_df=None,
              train_size=0.6, random_state=None):
    """Cut each user's rows into single-label windows and save a train/test split.

    For every distinct value in ``data_df['user']`` the rows of that user are
    scanned in frame order. A run of ``window_size`` consecutive rows is kept
    only when ``check_label`` reports that all of its labels are equal;
    otherwise the scan jumps ahead by the offset ``check_label`` returns.
    The kept windows are split with ``train_test_split`` (shuffled) and pickled
    into ``save_path`` as ``{user}_train_labels.pkl``, ``{user}_train_datas.pkl``,
    ``{user}_test_labels.pkl`` and ``{user}_test_datas.pkl``.

    Args:
        window_size: Number of consecutive rows per window; must be >= 1.
        step: Stride between the starts of two accepted windows. It is clamped
            to at most ``window_size - 1`` and at least 1, so the default of 0
            (which previously never advanced the scan) means a stride of 1.
        save_path: Directory that receives the pickles; created if missing.
        data_df: DataFrame with ``user``, ``data`` and ``label`` columns. When
            ``None`` it is unpickled from ``./wisdm/data_df.pkl``.
        train_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``; pass an int to get a
            reproducible split. ``None`` keeps the split random on every run.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f'window_size must be >= 1, got {window_size}')

    if data_df is None:
        # NOTE(review): unpickling executes code stored in the file; only load a
        # data_df.pkl that was produced locally by this notebook.
        with open('./wisdm/data_df.pkl', 'rb') as fo:
            data_df = pickle.load(fo)

    # Upper clamp keeps consecutive windows overlapping (original behaviour);
    # lower clamp guarantees the scan always moves forward.
    step = max(1, min(step, window_size - 1))

    # The target directory does not depend on the user, so create it once.
    os.makedirs(save_path, exist_ok=True)

    for user in data_df['user'].drop_duplicates().values:
        user_rows = data_df[data_df['user'] == user]
        y = user_rows['label'].values
        # (n,) column of per-row sequences --> (n, channels) float32 array
        x = np.stack(user_rows['data'].values).astype(np.float32)

        # Collect the start offset of every window whose labels are all equal.
        starts = []
        i = 0
        # '<=' so that the window ending exactly on the last row is considered.
        while i <= len(y) - window_size:
            flag, next_idx = check_label(y, i, window_size)
            if flag:
                starts.append(i)
                i += step
            else:
                i += next_idx

        print(f'user:{user}, data num:{len(starts)}')

        # train_test_split cannot produce two non-empty parts from < 2 samples;
        # skip such users instead of aborting the whole run.
        if len(starts) < 2:
            print(f'user:{user}, skipped: fewer than 2 windows, nothing to split')
            continue

        labels = y[starts]
        datas = np.stack([x[s:s + window_size] for s in starts])

        # split train and test
        train_datas, test_datas, train_labels, test_labels = train_test_split(
            datas, labels, train_size=train_size, shuffle=True, random_state=random_state)

        # save
        outputs = {
            'train_labels': train_labels,
            'train_datas': train_datas,
            'test_labels': test_labels,
            'test_datas': test_datas,
        }
        for suffix, payload in outputs.items():
            with open(os.path.join(save_path, f'{user}_{suffix}.pkl'), 'wb') as fo:
                pickle.dump(payload, fo)


def check_label(y, i, window_size):
    """Report whether the window ``y[i : i + window_size]`` holds a single label.

    Args:
        y: Indexable sequence of labels.
        i: Index of the first element of the window.
        window_size: Number of elements to inspect.

    Returns:
        ``(flag, next_idx)``. ``flag`` is ``True`` when no element differs from
        the one before it inside the window. ``next_idx`` is the offset
        (relative to ``i``) of the last position where the label changed, or 0
        when it never changed.
    """
    reference = y[i]
    uniform = True
    last_change = 0
    for offset in range(window_size):
        sample = y[i + offset]
        if sample != reference:
            # A new run of labels starts here; compare later elements against it.
            uniform = False
            last_change = offset
            reference = sample
    return uniform, last_change

In [8]:
# Window every user's rows (128 rows per window, stride 2) and write the
# per-user train/test pickles under ./wisdm/dataset/.
make_data(
    window_size=128,
    step=2,
    save_path='./wisdm/dataset/',
    data_df=data_df,
)