<a href="https://colab.research.google.com/github/Ghifarahadian/concurrent-activity-recognition/blob/master/datacleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# this is a classic usage for google colab

import sys
import os.path as osp
import os
import gc
from google.colab import drive
# Mount Google Drive inside the Colab VM at /content/drive.
drive.mount('/content/drive')
# Project folder on the mounted drive; the relative dataset paths used by the
# cells below resolve against it once it becomes the working directory.
ROOT = osp.join('/content', 'drive', 'My Drive', 'concurrent-activity-recognition')
os.chdir(ROOT)

In [None]:
import torch
import pandas as pd
import numpy as np
import glob

# Pick the compute device: the first CUDA GPU when torch can see one, otherwise
# the CPU. Further GPUs would be addressed as "cuda:1", "cuda:2", ...
gpu_available = torch.cuda.is_available()
device = torch.device("cuda:0" if gpu_available else "cpu")
print("Running on the GPU" if gpu_available else "Running on the CPU")

In [None]:
# data column constant
#
# Each col_<part> list holds the three column labels selected from the raw
# file for that body part, and col_<part>_name the names given to them after
# selection. col_label / col_label_name do the same for the three label columns.

col_hip = [4, 5, 6]
col_hip_name = ["hip_x", "hip_y", "hip_z"]
col_left_hand = [13, 14, 15]
col_left_hand_name = ["left_hand_x", "left_hand_y", "left_hand_z"]
col_right_wrist = [22, 23, 24]
col_right_wrist_name = ["right_wrist_x", "right_wrist_y", "right_wrist_z"]
col_left_wrist = [31, 32, 33]
col_left_wrist_name = ["left_wrist_x", "left_wrist_y", "left_wrist_z"]
col_right_hand = [34, 35, 36]
col_right_hand_name = ["right_hand_x", "right_hand_y", "right_hand_z"]
col_label = [243, 247, 249]
col_label_name = ["Locomotion", "LL_Right_Arm", "ML_Both_Arms"]

# Raw column labels in output order: features first, labels last.
col_all = [
    index
    for group in (col_hip, col_left_hand, col_right_wrist, col_left_wrist,
                  col_right_hand, col_label)
    for index in group
]

# Names of the feature columns only, in the same order as in col_all.
col_feature_name = [
    name
    for group in (col_hip_name, col_left_hand_name, col_right_wrist_name,
                  col_left_wrist_name, col_right_hand_name)
    for name in group
]

# Names of every selected column: feature names followed by label names.
col_all_name = col_feature_name + col_label_name

In [None]:
# normalizing function

def normalize(ser):
    """Return *ser* standardized to zero mean and unit sample standard deviation.

    Parameters
    ----------
    ser : pandas.Series
        Values to standardize (one column when used with ``DataFrame.apply``).

    Returns
    -------
    Same type as *ser*: ``(ser - mean) / std``. When the standard deviation is
    a scalar that is zero (constant column) or NaN (fewer than two values, as
    pandas uses the sample standard deviation), the values are only centred,
    so the result is zeros instead of NaN/inf.
    """
    mean = ser.mean()
    std = ser.std()
    # Guard only the scalar case; a non-scalar std (e.g. DataFrame input)
    # keeps the plain element-wise division.
    if np.ndim(std) == 0 and (pd.isna(std) or std == 0):
        return ser - mean
    return (ser - mean) / std

In [None]:
# clean the data, both train data and test data
#
# For every raw "dataset/<split>/*.dat" file: keep the selected sensor and
# label columns, drop rows with missing values or a 0 in any label column,
# remap the label codes to 0-based class ids and write the result to
# "dataset_cleaned/<split>/<name>.csv".
data_types = ["train", "test"]

# raw label code -> 0-based class id, per label column
label_maps = {
    "Locomotion": {1: 0, 2: 1, 4: 2, 5: 3},
    # codes 401..413 map to 0..12 in order
    "LL_Right_Arm": {code: idx for idx, code in enumerate(range(401, 414))},
    "ML_Both_Arms": {
        code: idx
        for idx, code in enumerate([
            406516, 406517, 404516, 404517, 406520, 404520,
            406505, 404505, 406519, 404519, 406511, 404511,
            406508, 404508, 408512, 407521, 405506,
        ])
    },
}

for data_type in data_types:
    out_dir = osp.join("dataset_cleaned", data_type)
    # create the output folder up front so to_csv cannot fail on a fresh checkout
    os.makedirs(out_dir, exist_ok=True)

    data_files = glob.glob("dataset/" + data_type + "/*.dat")
    for filename in data_files:
        print(filename)
        # raw string: "\s" is not a valid escape sequence in a normal string literal
        data = pd.read_table(filename, sep=r"\s+", header=None)

        # keep only the columns of interest and free the full table early
        df = data[col_all].copy()
        del data
        gc.collect()
        df.columns = col_all_name

        df.dropna(inplace=True)
        # keep rows where none of the three label columns is 0;
        # copy so the assignments below do not write into a view of the old frame
        labelled = (df[col_label_name] != 0).all(axis=1)
        df = df[labelled].copy()

        # remap the output as 0-indexing to make learning possible
        for column, mapping in label_maps.items():
            df[column] = df[column].map(mapping)
            # Series.map yields NaN for codes missing from the mapping; NaNs were
            # dropped above, so any NaN here is an unmapped code. Report it.
            unmapped = int(df[column].isna().sum())
            if unmapped:
                print("WARNING: " + str(unmapped) + " rows in " + filename +
                      " have an unmapped " + column + " code")

        # file name without directory and extension, independent of path depth
        file = osp.splitext(osp.basename(filename))[0]
        df.to_csv(osp.join(out_dir, file + ".csv"), index=False)

In [None]:
# parse the data
#
# Split every cleaned recording into runs of constant "Locomotion" label and
# cut each run into files of WINDOW_LEN rows under "dataset_parsed/<split>/".
# Feature columns are normalized per file; the last, shorter piece of a run is
# padded at the top with all-zero rows.
data_types = ["train", "test"]
WINDOW_LEN = 30  # rows per output file

for data_type in data_types:
    out_dir = osp.join("dataset_parsed", data_type)
    # create the output folder up front so to_csv cannot fail on a fresh checkout
    os.makedirs(out_dir, exist_ok=True)

    data_files = glob.glob("dataset_cleaned/" + data_type + "/*.csv")
    for filename in data_files:
        i = 0  # running index of the windows written for this recording
        # file name without directory and extension, independent of path depth
        file = osp.splitext(osp.basename(filename))[0]
        print(filename)
        print(file)
        df = pd.read_csv(filename, header=0)

        # "temp" is a run id that increases whenever Locomotion changes
        df = df.assign(temp=df["Locomotion"].diff().ne(0).cumsum())

        for _, sub_df in df.groupby("temp"):
            # work on a fresh frame instead of mutating the groupby slice in place
            sub_df = sub_df.drop(columns=["temp"]).reset_index(drop=True)

            # emit full windows while more than WINDOW_LEN rows remain
            while len(sub_df) > WINDOW_LEN:
                sub_df_cut = sub_df.iloc[:WINDOW_LEN].copy()
                sub_df_cut[col_feature_name] = sub_df_cut[col_feature_name].apply(normalize, axis=0)
                sub_df_cut.to_csv(osp.join(out_dir, file + "-" + str(i) + ".csv"),
                                  index=False)
                sub_df = sub_df.iloc[WINDOW_LEN:].reset_index(drop=True)
                i += 1

            # normalize first, pad later
            sub_df[col_feature_name] = sub_df[col_feature_name].apply(normalize, axis=0)

            # pad here: zero rows go in front of the real rows
            pad_rows = WINDOW_LEN - len(sub_df)
            if pad_rows > 0:
                padding = np.zeros([pad_rows, len(col_all_name)])
                padding_df = pd.DataFrame(padding, columns=col_all_name)
                # DataFrame.append was removed in pandas 2.0; concat keeps the same order
                sub_df = pd.concat([padding_df, sub_df])

            sub_df.to_csv(osp.join(out_dir, file + "-" + str(i) + ".csv"), index=False)

            i += 1

In [None]:
# scratch: tiny 2x2 frame used to try out normalize() in the next cells
temp = pd.DataFrame([[1, 2], [3, 4]])

In [None]:
# apply normalize() to each column (axis=0) of the scratch frame
temp2 = temp.apply(normalize, axis=0)

In [None]:
# display the normalized scratch frame
temp2

In [None]:
# scratch: 18-column zero block with as many rows as sub_df is short of 100
# NOTE(review): 100 differs from the 30-row window used by the parsing cell;
# presumably left over from an earlier experiment - confirm
test = np.zeros([100-len(sub_df), 18])

In [None]:
# wrap the zero block in a frame that uses the cleaned-data column names
test_df = pd.DataFrame(test, columns=col_all_name)

In [None]:
# display sub_df as left by the last cell that assigned it
sub_df

In [None]:
# put the zero rows of test_df in front of sub_df; DataFrame.append was removed
# in pandas 2.0, pd.concat produces the same row order and keeps the indices
sub_df = pd.concat([test_df, sub_df])

In [None]:
# scratch: the integers 0..9
drop_index = list(range(10))

# label-based slice of sub_df up to and including index label 80
df_new1 = sub_df.loc[:80]

In [None]:
# renumber the rows 0..n-1 in place and discard the old index
sub_df.reset_index(drop=True, inplace=True)

In [None]:
# scratch: keep everything after the first 10 rows
sub_df = sub_df[10:]

In [None]:
# display the sliced sub_df
sub_df

In [None]:
# collect, for every parsed file (train and test), the value at row 29,
# column 15 - the "Locomotion" label of the last row of the window

data_types = ["train", "test"]

targets = []

for data_type in data_types:
    data_files = glob.glob("dataset_parsed/" + data_type + "/*.csv")
    for filename in data_files:
        print(filename)
        parsed = pd.read_csv(filename, header=0)
        targets.append(parsed.iloc[29, 15])

In [None]:
# display the collected labels
targets

In [None]:
from itertools import groupby
# length of every run of consecutive equal values in targets
[sum(1 for _ in run) for _label, run in groupby(targets)]

In [None]:
# record the number of rows of every parsed file (train and test)

data_types = ["train", "test"]

file_lengths = []

for data_type in data_types:
    data_files = glob.glob("dataset_parsed/" + data_type + "/*.csv")
    for filename in data_files:
        print(filename)
        parsed = pd.read_csv(filename, header=0)
        file_lengths.append(parsed.shape[0])

In [None]:
# total number of rows over all parsed files
np.sum(file_lengths)

In [None]:
# number of parsed files that were read
len(file_lengths)

In [None]:
# load one parsed training file for inspection (rebinds the scratch name temp)
temp = pd.read_csv("dataset_parsed/train/S3-ADL1-8.csv")

In [None]:
# rows with index labels 99 through 100 (.loc slices include both ends)
# NOTE(review): the parsing cell writes 30-row files, so this looks like a
# leftover from a 100-row experiment - confirm
temp.loc[99:100]

In [None]:
# count the parsed files that have fewer than 100 rows
p31 = np.asarray(file_lengths)
(p31 < 100).sum()

In [None]:
# smallest row count among the parsed files
np.min(file_lengths)

In [None]:
import seaborn as sns

In [None]:
# plot the distribution of the per-file row counts
# NOTE(review): the seaborn upgrade cell comes after this one; presumably
# displot needs the upgraded version - confirm and run the upgrade first
sns.displot(file_lengths)

In [None]:
# upgrade seaborn in the notebook runtime (IPython shell escape)
!pip install --upgrade seaborn