In [8]:
from os.path import join, isfile, isdir
from os import listdir
import pandas as pd
import re

In [2]:
# Root directory of the dataset; every path used below is built from it with join().
# NOTE(review): this is an absolute, machine-specific path — point it at your
# local copy of the data before running the notebook.
dataset_path = "/home/samuel/PycharmProjects/LiquidNetworks/out/human_activity"

In [3]:
# Load the subjects table that sits directly under the dataset root.
subjects_csv_path = join(dataset_path, "data_subjects_info.csv")
subject_df = pd.read_csv(subjects_csv_path, sep=",")

In [4]:
# Display the subjects table read from data_subjects_info.csv.
subject_df

Unnamed: 0,code,weight,height,age,gender
0,1,102,188,46,1
1,2,72,180,28,1
2,3,48,161,28,0
3,4,90,176,31,1
4,5,48,164,23,0
5,6,76,180,28,1
6,7,62,175,30,0
7,8,52,161,24,0
8,9,93,190,32,1
9,10,72,164,31,0


In [42]:
# Number of consecutive samples per window; each file is trimmed to a multiple of it.
seq_length = 32

# Activity directories are named "<activity>_<number>", recordings "sub_<id>.csv".
regex_activity = re.compile(r"^(\w+)_\d+$")
regex_subject = re.compile(r"^sub_(\d+)\.csv$")

data_path = join(dataset_path, "A_DeviceMotion_data", "A_DeviceMotion_data")

df_list = []

# listdir() returns entries in arbitrary order; sorting makes the row order and
# the `file_index` values reproducible from one run (or machine) to the next.
for d in sorted(listdir(data_path)):
    dir_path = join(data_path, d)
    matched_dir = regex_activity.match(d)
    if not (isdir(dir_path) and matched_dir):
        continue
    act = matched_dir.group(1)

    for f in sorted(listdir(dir_path)):
        matched_file = regex_subject.match(f)
        if not (isfile(join(dir_path, f)) and matched_file):
            continue
        subject = matched_file.group(1)

        sub_df = pd.read_csv(join(dir_path, f), sep=",")
        sub_df["act"] = act
        sub_df["subject"] = subject
        sub_df["file_index"] = len(df_list)
        # Drop the leading rows so the file length is a multiple of seq_length.
        # .copy() detaches the slice from the frame it was cut from, so the
        # column assignment below does not trigger SettingWithCopyWarning.
        sub_df = sub_df.iloc[len(sub_df) % seq_length:, :].copy()
        # Sample position within the (trimmed) file, starting at 0.
        sub_df["time"] = list(range(len(sub_df)))

        df_list.append(sub_df)

# "Unnamed: 0" is the index column stored in the CSV files; it carries no signal.
all_data_df = pd.concat(df_list).drop("Unnamed: 0", axis=1)

In [43]:
# Sanity check: (number of windows, number of rows, rows covered by whole windows).
# The last two values are equal when the row count is a multiple of seq_length.
len(all_data_df) // seq_length, len(all_data_df), len(all_data_df) // seq_length * seq_length

(43972, 1407104, 1407104)

In [45]:
# List the columns of the combined frame: the columns read from the CSV files
# plus the "act", "subject", "file_index" and "time" columns added while loading.
all_data_df.columns

Index(['attitude.roll', 'attitude.pitch', 'attitude.yaw', 'gravity.x',
       'gravity.y', 'gravity.z', 'rotationRate.x', 'rotationRate.y',
       'rotationRate.z', 'userAcceleration.x', 'userAcceleration.y',
       'userAcceleration.z', 'act', 'subject', 'file_index', 'time'],
      dtype='object')

In [ ]:
from torch.utils.data import Dataset
from typing import Tuple, List
import torch as th


# MotionSense Dataset: Sensor Based Human Activity and Attribute Recognition
class MotionSenseDataset(Dataset):
    """Fixed-length windows of MotionSense device-motion recordings.

    Recordings are read from
    ``<dataset_path>/A_DeviceMotion_data/A_DeviceMotion_data/<activity>_<trial>/sub_<id>.csv``.
    The leading ``len(file) % seq_length`` rows of every file are dropped so
    that each file contributes a whole number of windows and no window spans
    two files.

    Recordings whose trial number is in 1..9 form the training split; every
    other trial forms the test split.

    Args:
        dataset_path: Root directory of the dataset.
        load_train: ``True`` to load the training split, ``False`` for the
            test split.
        seq_length: Number of consecutive samples per window.

    Raises:
        ValueError: If ``seq_length`` is not positive, or if no recording is
            found under ``dataset_path``.
    """

    def __init__(
        self, dataset_path: str, load_train: bool = True, seq_length: int = 32
    ) -> None:
        super().__init__()

        if seq_length <= 0:
            raise ValueError(f"seq_length must be positive, got {seq_length}")

        train_trials = list(range(1, 10))

        self.__seq_length = seq_length

        self.__features_columns = [
            "attitude.roll", "attitude.pitch", "attitude.yaw", "gravity.x",
            "gravity.y", "gravity.z", "rotationRate.x", "rotationRate.y",
            "rotationRate.z", "userAcceleration.x", "userAcceleration.y",
            "userAcceleration.z",
        ]
        self.__target_column = "act"

        regex_activity = re.compile(r"^(\w+)_(\d+)$")
        regex_subject = re.compile(r"^sub_(\d+)\.csv$")

        data_path = join(dataset_path, "A_DeviceMotion_data", "A_DeviceMotion_data")

        df_list: List[pd.DataFrame] = []

        # listdir() returns entries in arbitrary order; sorting makes the
        # sample order identical from one run (or machine) to the next.
        for d in sorted(listdir(data_path)):
            dir_path = join(data_path, d)
            matched_dir = regex_activity.match(d)
            if not (isdir(dir_path) and matched_dir):
                continue
            act = matched_dir.group(1)
            trial = int(matched_dir.group(2))

            for f in sorted(listdir(dir_path)):
                matched_file = regex_subject.match(f)
                if not (isfile(join(dir_path, f)) and matched_file):
                    continue

                sub_df = pd.read_csv(join(dir_path, f), sep=",")
                # Trim from the start so the file holds whole windows only.
                n_dropped = len(sub_df) % self.__seq_length
                # Column selection + assign() build a new frame, so nothing is
                # ever written onto a slice of the frame that was read.
                df_list.append(
                    sub_df.iloc[n_dropped:][self.__features_columns].assign(
                        **{self.__target_column: act, "trial": trial}
                    )
                )

        if not df_list:
            raise ValueError(f"no MotionSense recordings found under {data_path!r}")

        full_df = pd.concat(df_list, ignore_index=True)

        # Built from ALL recordings, before the split is applied, so the train
        # and test datasets always map an activity to the same class index.
        self.__class_to_idx = {
            c: i for i, c in enumerate(sorted(full_df[self.__target_column].unique()))
        }

        in_train = full_df["trial"].isin(train_trials)
        split_df = full_df[in_train if load_train else ~in_train]

        # Convert once here instead of on every __getitem__ call.
        # Missing sensor values are replaced by 0.
        self.__features = th.tensor(
            split_df[self.__features_columns].astype(float).fillna(0).to_numpy(),
            dtype=th.float,
        )
        self.__targets = th.tensor(
            split_df[self.__target_column]
            .map(self.__class_to_idx)
            .to_numpy(dtype="int64"),
            dtype=th.long,
        )

    def __len__(self) -> int:
        """Return the number of whole windows in the loaded split."""
        return len(self.__targets) // self.__seq_length

    def __getitem__(self, index: int) -> Tuple[th.Tensor, th.Tensor, th.Tensor]:
        """Return the window at position ``index``.

        Args:
            index: Window position; negative values count from the end.

        Returns:
            A tuple ``(features, ones, targets)``:
            ``features`` is a float tensor of shape ``(n_features, seq_length)``,
            ``ones`` is a float tensor of ``seq_length`` ones
            (NOTE(review): presumably a mask or per-step time delta for the
            consumer — confirm), and ``targets`` is a long tensor of
            ``seq_length`` class indices.

        Raises:
            IndexError: If ``index`` is out of range. This is also what lets
                plain ``for item in dataset`` iteration terminate.
        """
        n_windows = len(self)
        if index < 0:
            index += n_windows
        if not 0 <= index < n_windows:
            raise IndexError(
                f"window index out of range for dataset of {n_windows} windows"
            )

        index_start = index * self.__seq_length
        index_end = index_start + self.__seq_length

        return (
            # Copy into a fresh contiguous tensor so callers never alias the
            # dataset's internal storage.
            self.__features[index_start:index_end].T.clone(
                memory_format=th.contiguous_format
            ),
            th.ones(self.__seq_length, dtype=th.float),
            self.__targets[index_start:index_end].clone(),
        )