In [1]:
# Switch the working directory to the project checkout so that the relative
# "dataset/..." paths used by the later cells resolve.
# NOTE(review): ROOT is hard-coded to /home/ubuntu/... (presumably the default
# user of the AWS instance this was run on); adjust it when running elsewhere.

import sys
import os
ROOT = os.path.join('/home', 'ubuntu', 'concurrent-activity-recognition')
os.chdir(ROOT)  # raises if ROOT does not exist on this machine
print(os.getcwd())  # echo the working directory now in effect

/home/ubuntu/concurrent-activity-recognition


In [2]:
import torch
import pandas as pd
import numpy as np
import glob

# Pick the compute device once: the first CUDA GPU ("cuda:0") when CUDA is
# available, otherwise the CPU. Other GPUs would be "cuda:1", "cuda:2", ...
use_gpu = torch.cuda.is_available()
device = torch.device("cuda:0" if use_gpu else "cpu")
print("Running on the GPU" if use_gpu else "Running on the CPU")

Running on the GPU


In [36]:
# Notebook-wide configuration: which raw columns to keep, what to call them,
# where intermediate files live, and which label is the prediction target.
USE_FEATURES = "few"
DATASET_CLEANED_DIR = "dataset_cleaned_few_features"
DATASET_PARSED_DIR = "dataset_parsed_few_features_label0_label1"
TARGET_LABEL = "LL_Right_Arm"
DATA_TYPES = ["train", "test"]

if USE_FEATURES == "few":
    # Reduced sensor set: the features that are retrievable from a fitbit
    # and a phone. Each sensor contributes three raw column indices (x, y, z).
    col_hip = [4, 5, 6]
    col_hip_name = ["hip_x", "hip_y", "hip_z"]
    col_left_hand = [13, 14, 15]
    col_left_hand_name = ["left_hand_x", "left_hand_y", "left_hand_z"]
    col_right_wrist = [22, 23, 24]
    col_right_wrist_name = ["right_wrist_x", "right_wrist_y", "right_wrist_z"]
    col_left_wrist = [31, 32, 33]
    col_left_wrist_name = ["left_wrist_x", "left_wrist_y", "left_wrist_z"]
    col_right_hand = [34, 35, 36]
    col_right_hand_name = ["right_hand_x", "right_hand_y", "right_hand_z"]
    col_label = [243, 247, 249]
    col_label_name = ["Locomotion", "LL_Right_Arm", "ML_Both_Arms"]

    # The order of the groups fixes the column order of every derived file;
    # indices and names must be listed in the same order.
    feature_index_groups = [col_hip, col_left_hand, col_right_wrist,
                            col_left_wrist, col_right_hand]
    feature_name_groups = [col_hip_name, col_left_hand_name,
                           col_right_wrist_name, col_left_wrist_name,
                           col_right_hand_name]

    col_feature_name = [name
                        for group in feature_name_groups
                        for name in group]
    col_all = [index
               for group in feature_index_groups
               for index in group] + col_label
    col_all_name = col_feature_name + col_label_name
elif USE_FEATURES == "all":
    # Full sensor set: all accelerometer data from sensors placed on the
    # body (raw columns 1..36), followed by the three label columns.
    col_all = list(range(1, 37)) + [243, 247, 249]

    feature_list = ["RKNUp", "Hip", "LUAUp", "RUADn", "LH", "Back", "RKNDn",
                    "RWR", "RUAUp", "LUADn", "LWR", "RH"]
    axis_list = ["X", "Y", "Z"]
    # One name per (sensor, axis) pair, sensor-major: RKNUp_X, RKNUp_Y, ...
    col_feature_name = [sensor + "_" + axis
                        for sensor in feature_list
                        for axis in axis_list]

    col_all_name = col_feature_name + ["Locomotion", "LL_Right_Arm",
                                       "ML_Both_Arms"]
else:
    raise Exception("USE_FEATURES is not recognized")

In [25]:
# clean the data, both train data and test data

# normalizing function
def normalize(ser: pd.Series) -> pd.Series:
    """Standardize a series to zero mean and unit (sample) standard deviation.

    Parameters
    ----------
    ser : pd.Series
        Numeric values to standardize.

    Returns
    -------
    pd.Series
        ``(ser - mean) / std``. When the standard deviation is zero (constant
        series) or undefined (fewer than two non-missing values), the centred
        values ``ser - mean`` are returned instead, so a degenerate column
        does not turn into NaN/inf after the division.
    """
    centered = ser - ser.mean()
    std = ser.std()
    # Dividing by a zero/NaN std would poison the whole column.
    if pd.isna(std) or std == 0:
        return centered
    return centered / std

# Raw label codes -> contiguous 0-based class ids (needed for learning).
# The tables are loop-invariant, so they are built once up front.
LABEL_MAPS = {
    "Locomotion": {1: 0, 2: 1, 4: 2, 5: 3},
    "LL_Right_Arm": {
        401: 0, 402: 1, 403: 2, 404: 3, 405: 4, 406: 5, 407: 6, 408: 7,
        409: 8, 410: 9, 411: 10, 412: 11, 413: 12},
    "ML_Both_Arms": {
        406516: 0, 406517: 1, 404516: 2, 404517: 3, 406520: 4, 404520: 5,
        406505: 6, 404505: 7, 406519: 8, 404519: 9, 406511: 10, 404511: 11,
        406508: 12, 404508: 13, 408512: 14, 407521: 15, 405506: 16},
}
LABEL_COLUMNS = list(LABEL_MAPS)

# For every split, turn each raw whitespace-separated .dat recording into a
# cleaned CSV: selected columns only, missing sensor values imputed, rows with
# a 0 label removed, labels remapped to class ids, features standardized.
for data_type in DATA_TYPES:
    # Create the output folder up front so to_csv works on a fresh checkout.
    out_dir = os.path.join(DATASET_CLEANED_DIR, data_type)
    os.makedirs(out_dir, exist_ok=True)

    data_files = glob.glob("dataset/" + data_type + "/*.dat")
    for filename in data_files:
        print(filename)
        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        data = pd.read_table(filename, sep=r"\s+", header=None)

        # keep only the configured columns and give them readable names
        df = data[col_all].copy()
        del data  # release the wide raw frame before further processing
        df.columns = col_all_name

        # Impute missing sensor readings with the per-file column mean.
        # Labels are class codes, so averaging them is meaningless; rows with
        # a missing label are dropped after the remapping below instead.
        df[col_feature_name] = df[col_feature_name].fillna(
            df[col_feature_name].mean())

        # Keep only rows where none of the three label columns equals 0.
        # .copy() gives an independent frame so the assignments below do not
        # write into a slice of the unfiltered data.
        has_all_labels = (df[LABEL_COLUMNS] != 0).all(axis=1)
        df = df[has_all_labels].copy()

        # remap the output as 0-indexing to make learning possible
        for column, mapping in LABEL_MAPS.items():
            df[column] = df[column].map(mapping)

        # .map() yields NaN for missing or unknown codes; such rows cannot be
        # used as targets, so report and drop them rather than writing NaN.
        unlabeled = df[LABEL_COLUMNS].isna().any(axis=1)
        if unlabeled.any():
            print("  dropping " + str(int(unlabeled.sum())) +
                  " rows with missing or unknown labels")
            df = df[~unlabeled].copy()
        df[LABEL_COLUMNS] = df[LABEL_COLUMNS].astype(int)

        # apply normalization function (per file, per feature column)
        df[col_feature_name] = df[col_feature_name].apply(normalize, axis=0)

        # "dataset/train/S3-Drill.dat" -> "S3-Drill", independent of how many
        # directory levels or which path separator the glob result contains.
        file = os.path.splitext(os.path.basename(filename))[0]
        df.to_csv(os.path.join(out_dir, file + ".csv"), index=False)

dataset/train/S3-Drill.dat
dataset/train/S3-ADL3.dat
dataset/train/S3-ADL4.dat
dataset/train/S3-ADL2.dat
dataset/train/S3-ADL1.dat
dataset/test/S3-ADL5.dat


In [37]:
# parse the data
# Split every cleaned recording into CSV windows of exactly FILE_LENGTH rows.
# A recording is first cut into segments wherever the (Locomotion,
# LL_Right_Arm) label pair changes; each segment is then chopped into
# consecutive windows, and a short trailing window is zero-padded at the front.
FILE_LENGTH = 30  # number of rows in every written window

for data_type in DATA_TYPES:
    # Create the output folder up front so to_csv works on a fresh checkout.
    out_dir = os.path.join(DATASET_PARSED_DIR, data_type)
    os.makedirs(out_dir, exist_ok=True)

    data_files = glob.glob(DATASET_CLEANED_DIR + "/" + data_type + "/*.csv")
    for filename in data_files:
        print(filename)
        i = 0  # running window index within this recording
        file = os.path.splitext(os.path.basename(filename))[0]
        df = pd.read_csv(filename, header=0)

        # cut data for multiple labels
        # A new segment starts on every row whose label pair differs from the
        # previous row. Comparing the two columns directly avoids the
        # ambiguity of concatenating them as strings ("1"+"2" and "0"+"12"
        # both read back as 12) and also works when labels are floats.
        labels = df[["Locomotion", "LL_Right_Arm"]]
        segment_id = labels.ne(labels.shift()).any(axis=1).cumsum()

        for _, sub_df in df.groupby(segment_id):
            for start in range(0, len(sub_df), FILE_LENGTH):
                window = sub_df.iloc[start:start + FILE_LENGTH]

                # pad files that are less than specified length
                missing = FILE_LENGTH - len(window)
                if missing > 0:
                    # Columns are taken from the window itself so the padding
                    # always matches the file's actual schema.
                    padding_df = pd.DataFrame(
                        np.zeros((missing, window.shape[1])),
                        columns=window.columns)
                    # pd.concat replaces DataFrame.append (removed in pandas 2.0)
                    window = pd.concat([padding_df, window],
                                       ignore_index=True)

                window.to_csv(
                    os.path.join(out_dir, file + "-" + str(i) + ".csv"),
                    index=False)
                i += 1

dataset_cleaned_few_features/train/S3-Drill.csv
dataset_cleaned_few_features/train/S3-ADL3.csv
dataset_cleaned_few_features/train/S3-ADL4.csv
dataset_cleaned_few_features/train/S3-ADL1.csv
dataset_cleaned_few_features/train/S3-ADL2.csv
dataset_cleaned_few_features/test/S3-ADL5.csv


In [38]:
# Sanity checks over every parsed window file:
#   * record its row count (min / max are printed at the end)
#   * report files whose values sum to NaN
#   * record the target label found in row FILE_LENGTH-1, for the class
#     distribution computed afterwards
file_lengths = []
targets = []

for data_type in DATA_TYPES:
    pattern = DATASET_PARSED_DIR + "/" + data_type + "/*.csv"
    for filename in glob.glob(pattern):
        window = pd.read_csv(filename, header=0)
        file_lengths.append(len(window))
        targets.append(int(window.loc[FILE_LENGTH - 1, TARGET_LABEL]))
        # A single NaN anywhere in the file makes the overall sum NaN.
        if np.isnan(np.sum(window.to_numpy())):
            print("NaN files: " + filename)

for caption, reducer in (("Min length: ", np.min), ("Max length: ", np.max)):
    print(caption + str(reducer(file_lengths)))

Min length: 30
Max length: 30


In [12]:
# Leftover inspection cell: shows the value the `data_type` loop variable was
# left at by the most recently executed loop.
data_type

'test'

In [39]:
def CountFrequency(my_list):
    """Print how often each distinct item occurs in ``my_list``.

    One line is printed per distinct item, in order of first appearance,
    using the integer format ``"% d : % d"`` (item, count). Nothing is
    returned.
    """
    counts = {}
    for entry in my_list:
        counts[entry] = counts.get(entry, 0) + 1

    for entry, occurrences in counts.items():
        print("% d : % d" % (entry, occurrences))

# Print how many parsed windows fall into each target class.
CountFrequency(targets)

 4 :  589
 12 :  475
 7 :  118
 2 :  65
 5 :  347
 11 :  462
 3 :  346
 6 :  150
 0 :  57


In [40]:
# Total number of parsed window files across all splits (one target each).
len(targets)

2609