In [1]:
import os
import numpy as np

from hmog import HmogHelper
from tqdm import tqdm

# Root folder of the dataset; the entries returned by list_folder are used
# below both as subject identifiers and as path components.
dataset_path = 'hmog_public_dataset/'

subjects = HmogHelper.list_folder(dataset_path)

# subject -> {'w': [...], 's': [...]}, where 'w' holds the sessions selected by
# WW_SESSION_TYPES and 's' those selected by WS_SESSION_TYPES.
session_maps = {}

for subject in subjects:
    subject_dir = os.path.join(dataset_path, subject)

    # Both selections are made up front, before any session is read.
    ww_subject_sessions = HmogHelper.filter_user_sessions_by_type(subject_dir, HmogHelper.WW_SESSION_TYPES)
    ws_subject_sessions = HmogHelper.filter_user_sessions_by_type(subject_dir, HmogHelper.WS_SESSION_TYPES)

    loaded = {'w': [], 's': []}
    session_maps[subject] = loaded

    for key, session_names in (('w', ww_subject_sessions), ('s', ws_subject_sessions)):
        for session in session_names:
            session_data = HmogHelper.read_person_session(os.path.join(subject_dir, session))
            loaded[key].append(session_data)
            # Called for its side effect on the stored object; the return value is unused.
            HmogHelper.preprocess_session_data(session_data)

In [2]:
# subject -> {'w': [four matrices], 's': [four matrices]} for subjects whose
# eight extractions all reported success.
session_hmog_matrices = {}

# Subjects for which at least one extraction reported failure.
excluded_subjects = []

for subject in tqdm(session_maps):
    sessions_w = session_maps[subject]['w']
    sessions_s = session_maps[subject]['s']

    # Sessions at indices 0..3 of each kind are consumed below.
    assert len(sessions_w) > 3
    assert len(sessions_s) > 3

    # Every extraction is executed (no short-circuiting), 'w' sessions first,
    # then 's'; each call yields a (success flag, matrix) pair.
    outcomes = {
        kind: [
            HmogHelper.extract_hmog_features(one_session, HmogHelper.SENSOR_LIST, HmogHelper.DIMS_LIST)
            for one_session in kind_sessions[:4]
        ]
        for kind, kind_sessions in (('w', sessions_w), ('s', sessions_s))
    }

    extract_success = all(
        success
        for results in outcomes.values()
        for success, _ in results
    )

    if extract_success:
        session_hmog_matrices[subject] = {
            kind: [matrix for _, matrix in results]
            for kind, results in outcomes.items()
        }
    else:
        excluded_subjects.append(subject)

print(f'excluded subjects: {excluded_subjects}')
# std_deviations, session_hmog_vector = calc_std_and_scale(session_hmog_vector)

100%|██████████| 100/100 [1:29:41<00:00, 53.82s/it]

excluded subjects: ['733162']





In [3]:
import pickle

# File that caches the extracted matrices between notebook runs.
# NOTE(review): a later cell loads 'preprocessed_data/hmog_dataset_ww_ws', while
# this cell writes into the working directory — confirm the file is moved by hand.
pickled_ds_filename = 'hmog_dataset_ww_ws'

# An existing file is never overwritten; delete it manually to refresh the cache.
if not os.path.exists(pickled_ds_filename):
    # The context manager closes the handle even if pickling raises midway.
    with open(pickled_ds_filename, 'wb') as hmog_dataset_file:
        pickle.dump(session_hmog_matrices, hmog_dataset_file)

# Single Dataframe

Let's also build a single DataFrame that contains every session of every user.

In [1]:
import pickle

# NOTE(security): pickle.load can execute arbitrary code, so only load a file
# that was produced by a trusted run of this notebook.
# The context manager closes the handle even if unpickling raises.
with open('preprocessed_data/hmog_dataset_ww_ws', 'rb') as test_file:
    session_hmog_matrices = pickle.load(test_file)

In [2]:
# We need to exclude subjects who used landscape orientation
landscape_subjects = ['856302', '751131', '220962', '186676', '980953', '201848', '675397']
for l_s in landscape_subjects:
    # pop() with a default keeps this cell re-runnable: `del` raised KeyError
    # on a second execution or when a listed subject was already absent.
    session_hmog_matrices.pop(l_s, None)

subjects_list = list(session_hmog_matrices.keys())
print(f'len: {len(subjects_list)}')

len: 92


In [38]:
import pandas as pd

# Integer codes written to the 'session_type' column.
session_key_to_int = {'s': 0, 'w': 1}

# One 'timestamp' column followed by 64 columns labelled 1..64.
header = ['timestamp'] + list(range(1, 65))

# Per-session frames are collected here and concatenated once at the end;
# concatenating inside the loop re-copies all accumulated rows every iteration.
session_frames = []

for subject_key, sessions_by_type in session_hmog_matrices.items():
    for session_type, sessions in sessions_by_type.items():
        for session_index, session in enumerate(sessions):
            session_df = pd.DataFrame(session, columns=header)

            # Tag every row with the session it came from.
            session_df['user_id'] = subject_key
            session_df['session_type'] = session_key_to_int[session_type]
            session_df['session_index'] = session_index

            session_frames.append(session_df)

# ignore_index=True gives the unified frame a fresh 0..n-1 row index.
transformed_df = pd.concat(session_frames, ignore_index=True)

print(transformed_df.shape)

(521872, 68)


In [39]:
# Persist the unified frame; the row index is written as the first CSV column.
unified_csv_out = 'preprocessed_data/hmog_dataset_unified_df_ids_types_indices.csv'
transformed_df.to_csv(unified_csv_out)

## Windowed Split

It is also a good idea to apply a sliding window in advance.

In [6]:
import pandas as pd

# Reload the unified CSV, using its first column as the row index, and turn
# the 'timestamp' column (interpreted as milliseconds) into datetimes.
unified_csv_in = 'preprocessed_data/hmog_dataset_unified_df_ids_types_indices.csv'
df = pd.read_csv(unified_csv_in, index_col=0)
df = df.assign(timestamp=pd.to_datetime(df['timestamp'], unit='ms'))

In [8]:
def get_windowed_hmog_df(df, window_size_seconds, min_periods=8):
    """Return per-session rolling means over a trailing time window.

    Rows are grouped by ('user_id', 'session_type', 'session_index'); within
    each group a rolling mean is taken over a window of ``window_size_seconds``
    seconds measured along the 'timestamp' column. Rows that contain NaN after
    averaging (for example windows with fewer than ``min_periods``
    observations) are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the three grouping columns and a 'timestamp' column.
        NOTE(review): pandas needs 'timestamp' to be datetime-like for a
        time-based window — confirm callers convert it first.
    window_size_seconds : int or float
        Window length in seconds.
    min_periods : int, default 8
        Minimum number of observations a window needs to produce a value.

    Returns
    -------
    pandas.DataFrame
        The grouping keys as ordinary columns followed by the windowed
        columns, with a fresh 0..n-1 row index.
    """
    group_keys = ['user_id', 'session_type', 'session_index']

    windowed = (
        df.groupby(by=group_keys)
        .rolling(f'{window_size_seconds}s', on='timestamp', min_periods=min_periods)
        .mean()
        .dropna()
    )

    # The innermost index level is the caller's original row index. Dropping it
    # by position (not by the auto-generated column name 'level_3') also works
    # when that index carries a name.
    return windowed.droplevel(-1).reset_index()

In [11]:
# One windowed frame per window length: 20 s to 140 s in 20 s steps.
df_20s, df_40s, df_60s, df_80s, df_100s, df_120s, df_140s = (
    get_windowed_hmog_df(df, window_size_seconds=seconds)
    for seconds in range(20, 141, 20)
)

In [19]:
# Destination CSV for each windowed frame, written in ascending window order.
windowed_csv_outputs = (
    ('preprocessed_data/hmog_dataset_unified_df_20s.csv', df_20s),
    ('preprocessed_data/hmog_dataset_unified_df_40s.csv', df_40s),
    ('preprocessed_data/hmog_dataset_unified_df_60s.csv', df_60s),
    ('preprocessed_data/hmog_dataset_unified_df_80s.csv', df_80s),
    ('preprocessed_data/hmog_dataset_unified_df_100s.csv', df_100s),
    ('preprocessed_data/hmog_dataset_unified_df_120s.csv', df_120s),
    ('preprocessed_data/hmog_dataset_unified_df_140s.csv', df_140s),
)

for csv_path, windowed_frame in windowed_csv_outputs:
    windowed_frame.to_csv(csv_path)