In [1]:
import pandas as pd
import numpy as np
import os
import torch
import sklearn.preprocessing

from torch.utils.data import TensorDataset
from torch.nn.utils.rnn import pad_sequence
from tqdm.notebook import tqdm

In [2]:
# Load every line of the dataset README; the subject and activity tables are
# parsed out of these lines afterwards. The context manager guarantees the
# handle is closed even if reading fails part-way through.
# latin1 is presumably used because the file is not valid UTF-8 -- confirm.
with open('MobiAct_Dataset_v2.0/Readme.txt', 'r', encoding='latin1') as file:
    strings = file.readlines()

In [3]:
# Split the README's '|'-delimited table rows into subject rows and activity rows.
person_list = []
activity_list = []
for line in strings:
    # Only table rows contain the column separator.
    if '|' not in line:
        continue
    cells = [cell.strip() for cell in line.split('|')]
    n_cells = len(cells)
    if n_cells == 9 and 'sub' in line:
        # 9-cell rows mentioning 'sub' are treated as subject rows;
        # keep cells 3..7 (five fields), dropping the trailing cell.
        person_list.append(cells[3:-1])
    elif n_cells == 8:
        # 8-cell rows are treated as activity rows; drop the first and last cell.
        activity_list.append(cells[1:-1])

In [4]:
# Activity codes treated as falls.
falls = ['FOL', 'FKL', 'BSC', 'SDL']

# Subject table: one row per parsed subject line, five fields each.
columns = ['name', 'age', 'height', 'weight', 'gender']
person_info = pd.DataFrame(person_list, columns=columns)

# Activity table: the first parsed row supplies the column names.
activity_info = pd.DataFrame(activity_list)
activity_info.columns = activity_info.iloc[0]
# Row 0 is the header row just promoted to column names.
# NOTE(review): row 13 is presumably a repeated header inside the table -- confirm.
activity_info = activity_info.drop([0, 13]).reset_index(drop=True)
# Use the README's own 'No.' column as the row index.
activity_info = activity_info.set_index('No.')
# Consecutive integer code per remaining activity, in table order.
activity_info['label_encoded'] = range(len(activity_info))

In [5]:
# Persist both lookup tables. The output directory is created here because
# this is the first cell that writes into it; without this a fresh
# "Restart & Run All" fails on the first to_csv call.
os.makedirs('mobiact_preprocessed', exist_ok=True)
activity_info.to_csv('mobiact_preprocessed/activity_info.csv')
person_info.to_csv('mobiact_preprocessed/person_info.csv')

In [6]:
# Root folder holding one sub-folder per activity code.
data_dir = 'MobiAct_Dataset_v2.0/Annotated Data/'

# Collect the entries of data_dir, ignoring hidden ones (names starting with '.').
act_list = []
for entry in os.listdir(data_dir):
    if entry.startswith('.'):
        continue
    act_list.append(entry)
print(act_list)

['CHU', 'FOL', 'CSI', 'JOG', 'CSO', 'SBE', 'WAL', 'SBW', 'SLW', 'SIT', 'SCH', 'FKL', 'SRH', 'STD', 'BSC', 'JUM', 'SDL', 'SLH', 'STN', 'STU']


In [12]:
# Copy every annotated recording into train/valid/test folders, split by
# subject number, after attaching that subject's metadata as extra columns.
save_dir = 'mobiact_preprocessed/'
train_dir, valid_dir, test_dir = 'train/', 'valid/', 'test/'

# Subjects left out of every split.
# NOTE(review): the reason for excluding these three is not recorded here -- confirm.
excluded_subjects = {28, 34, 57}
# Metadata fields copied from person_info onto every row of a recording.
person_fields = ['age', 'height', 'weight', 'gender']

for dir_name in [train_dir, valid_dir, test_dir]:
    os.makedirs(save_dir + dir_name, exist_ok=True)

for act in act_list:
    file_dir = data_dir + act + '/'
    # Skip hidden entries (e.g. .DS_Store), matching how act_list itself is
    # built; otherwise parsing the subject number below raises ValueError.
    file_list = [f for f in os.listdir(file_dir) if not f.startswith('.')]
    for file in file_list:
        # The second '_'-separated token of the file name is the subject number.
        person_num = int(file.split('_')[1])
        if person_num in excluded_subjects:
            continue
        temp = pd.read_csv(file_dir + file)
        # assumes person_info rows are ordered by subject number starting at 1 -- TODO confirm
        person = person_info.iloc[person_num - 1]
        for field in person_fields:
            temp[field] = person[field]
        temp['person_id'] = person_num
        # Subjects 1-50 -> train, 51-56 -> valid, 57 and above -> test.
        if person_num <= 50:
            split_dir = train_dir
        elif person_num < 57:
            split_dir = valid_dir
        else:
            split_dir = test_dir
        temp.to_csv(save_dir + split_dir + file)

In [10]:
# Columns to preview: subject id, the six sensor channels, then subject metadata.
columns = [
    'person_id',
    'acc_x', 'acc_y', 'acc_z',
    'gyro_x', 'gyro_y', 'gyro_z',
    'height', 'weight', 'gender', 'age',
]
# Display those columns of whichever recording `temp` was last bound to.
temp.loc[:, columns]

Unnamed: 0,person_id,acc_x,acc_y,acc_z,gyro_x,gyro_y,gyro_z,height,weight,gender,age
0,36,-0.328140,9.830861,-0.816020,0.017715,-0.046426,-0.010079,164,62,F,22
1,36,-0.323301,9.821182,-0.830538,0.020159,-0.031765,-0.005192,164,62,F,22
2,36,-0.318456,9.811493,-0.845071,0.014355,-0.035736,-0.006414,164,62,F,22
3,36,-0.323218,9.818622,-0.854730,0.013439,-0.041233,-0.008858,164,62,F,22
4,36,-0.337355,9.842184,-0.859443,0.003360,-0.029627,-0.013439,164,62,F,22
...,...,...,...,...,...,...,...,...,...,...,...
1926,36,-0.096924,9.860768,-0.316613,-0.041539,-0.005192,0.025351,164,62,F,22
1927,36,-0.114922,9.883083,-0.325067,-0.026878,-0.004276,0.020769,164,62,F,22
1928,36,-0.114922,9.880720,-0.317978,-0.041233,-0.013744,0.021380,164,62,F,22
1929,36,-0.114922,9.878284,-0.310669,-0.037874,-0.015577,0.024740,164,62,F,22


In [None]:
# Find the row count of the longest fall recording across all three splits.
falls = ['FOL', 'FKL', 'BSC', 'SDL']
max_length = 0

for dir_name in (train_dir, valid_dir, test_dir):
    file_dir = save_dir + dir_name
    file_list = os.listdir(file_dir)
    for file in file_list:
        # The activity code is the first '_'-separated token of the file name.
        activity_code = file.split('_')[0]
        if activity_code not in falls:
            continue
        temp = pd.read_csv(file_dir + file)
        max_length = max(max_length, len(temp))
print(max_length)

In [None]:
# Scratch encoders for checking how scikit-learn encodes the values 'M' and 'F'.
orinalenc = sklearn.preprocessing.OrdinalEncoder()
onehotenc = sklearn.preprocessing.OneHotEncoder()

# Fit on the two values and show the codes assigned to them.
orinalenc.fit_transform([['M'], ['F']])

In [None]:
# Show the distinct values of the 'gender' column in whichever frame `temp` was last bound to.
set(temp['gender'])

In [None]:
# Fit the one-hot encoder on the two values, then show their dense encoding.
# fit() returns the encoder itself, so the calls can be chained.
onehotenc.fit([['M'], ['F']]).transform([['M'], ['F']]).toarray()

In [None]:
# Build one padded TensorDataset per split and save it as <split>.pt.
# Feature order of the second tensor axis.
columns = ['person_id', 'acc_x', 'acc_y', 'acc_z', 'gyro_x', 'gyro_y', 'gyro_z', 'height', 'weight', 'gender', 'age']
label_encoder = sklearn.preprocessing.LabelEncoder()
ordinal_encoder = sklearn.preprocessing.OrdinalEncoder()
ordinal_encoder.fit([['M'], ['F']])

save_name = ['train.pt', 'valid.pt', 'test.pt']
for idx, dir_name in enumerate([train_dir, valid_dir, test_dir]):
    X, y = [], []
    file_dir = save_dir + dir_name
    # Ignore hidden entries (e.g. .DS_Store) so read_csv only sees recordings.
    file_list = [f for f in os.listdir(file_dir) if not f.startswith('.')]
    for file in file_list:
        # The activity code is the first '_'-separated token of the file name.
        label = file.split('_')[0]
        temp = pd.read_csv(file_dir + file)
        # Truncate to max_length rows (a no-op for shorter recordings) and take
        # an explicit copy so the column assignment below never writes into a
        # slice of another frame.
        temp = temp.iloc[:max_length].copy()
        # Replace the 'M'/'F' strings with the encoder's numeric codes.
        temp['gender'] = ordinal_encoder.transform(temp['gender'].values.reshape(-1, 1)).ravel()
        val = torch.tensor(temp[columns].values)
        X.append(val)
        y.append(label)
    # pad_sequence stacks to (time, sample, feature), zero-padding shorter
    # recordings; permute reorders that to (sample, feature, time).
    X = pad_sequence(X)
    X = torch.permute(X, (1, 2, 0)).contiguous()
    # Fit the label mapping on the first (train) split only and reuse it for
    # the others; transform() raises if a later split has an unseen label.
    # NOTE(review): LabelEncoder numbers classes in sorted order, which may
    # differ from the 'label_encoded' column saved in activity_info.csv --
    # confirm which mapping downstream code expects.
    if idx == 0:
        y = label_encoder.fit_transform(y)
    else:
        y = label_encoder.transform(y)
    dataset = TensorDataset(X, torch.tensor(y))
    torch.save(dataset, save_dir + save_name[idx])

In [None]:
# Inspect the categories learned by the fitted encoder; each value's position
# in the array is the numeric code transform() emits for it.
ordinal_encoder.categories_

In [None]:
# The saved object is a pickled TensorDataset rather than a plain tensor or
# state dict, so it cannot be restored with weights_only=True (the default
# from PyTorch 2.6 on). The file is produced by this notebook itself, so full
# unpickling is acceptable here; do not reuse this on untrusted files.
train = torch.load(save_dir + 'train.pt', weights_only=False)
# First sample, second-to-last feature row: the encoded 'gender' channel over time.
train.tensors[0][0][-2]