# Imports

In [1]:
import numpy as np
import os
import pandas as pd
import pickle

from copy import deepcopy
from sklearn.model_selection import train_test_split

# Load Data

Raw Data: http://archive.ics.uci.edu/ml/datasets/smartphone-based+recognition+of+human+activities+and+postural+transitions#

# Functions

In [2]:
def get_files_list(url, view=False):
    """Collect the file-name heads and the shared tag found directly in ``url``.

    Each file name is stripped of everything after its first ``.`` and split
    on ``_``. The last four fields are treated as a fixed suffix: everything
    before them (re-joined with ``_``) is the *head*, and the fourth field
    from the end is returned as ``n``.

    Parameters
    ----------
    url : str
        Directory to scan. Only files at its top level are considered.
    view : bool, default False
        When True, print how many files were parsed.

    Returns
    -------
    tuple
        ``(heads, n)`` -- ``heads`` is the set of distinct heads; ``n`` is the
        tag taken from the last parsed file in sorted order, or ``None`` when
        no file was parsed (including when ``url`` does not exist).
    """
    heads = set()
    n = None
    count = 0

    # os.walk yields the top directory first; take just that entry rather
    # than traversing (and then discarding) every subdirectory.
    _, _, files = next(os.walk(url), (url, [], []))

    for f in sorted(files):
        fields = f.split(".")[0].split("_")
        if len(fields) < 4:
            # Name does not carry the four trailing fields (e.g. a hidden or
            # system file); indexing fields[-4] would raise IndexError.
            continue
        heads.add("_".join(fields[:-4]))
        n = fields[-4]
        count += 1

    if view:
        print(count)
    return heads, n

In [5]:
def load_files(url, view=False):
    """Load every data/labels CSV pair in ``url`` into one DataFrame.

    The directory is inspected with ``get_files_list``. For each distinct
    prefix, four file pairs named ``<prefix>data_<n>_v1_uc_<i>.csv`` and
    ``<prefix>labels_<n>_v1_uc_<i>.csv`` (``i`` = 1..4) are read without a
    header row, joined column-wise, and stacked row-wise. The last column of
    the result is renamed ``"label"``.

    A per-label shape summary and the overall shape are always printed.

    Parameters
    ----------
    url : str
        Directory holding the CSV files (a trailing slash is optional).
    view : bool, default False
        When True, also print the file count and the number of rows loaded
        for each prefix.

    Returns
    -------
    pandas.DataFrame
        Features followed by a ``"label"`` column. The row index is not
        reset, so index values repeat across the source files.

    Raises
    ------
    FileNotFoundError
        If no dataset files are found in ``url``.
    """
    heads, n = get_files_list(url, view)

    # Heads are "data"/"labels", optionally preceded by a prefix
    # ("<prefix>_data"); keep the distinct prefixes ("" when there is none).
    prefixes = set()
    for head in heads:
        parts = head.split("_")
        prefixes.add("_".join(parts[:-1]) if len(parts) > 1 else "")

    n_files = 4  # file pairs per prefix, numbered 1..4
    frames = []
    # Sorted so the row order (and any seeded split built on it) does not
    # depend on set iteration order, which varies between interpreter runs.
    for prefix in sorted(prefixes):
        if prefix != "":
            prefix = prefix + "_"
        rows = 0
        for i in range(1, n_files + 1):
            suffix = "_%s_v1_uc_%d.csv" % (n, i)
            data = pd.read_csv(os.path.join(url, "%sdata%s" % (prefix, suffix)), header=None)
            labels = pd.read_csv(os.path.join(url, "%slabels%s" % (prefix, suffix)), header=None)
            part = pd.concat([data, labels], axis=1)
            rows += part.shape[0]  # rows, i.e. samples (not columns)
            frames.append(part)
        if view:
            print(prefix, rows)

    if not frames:
        raise FileNotFoundError("No dataset files found in %r" % (url,))

    # Concatenate once instead of growing the frame inside the loop.
    df_master = pd.concat(frames, axis=0)

    k = "label"
    df_master.columns = list(df_master.columns)[:-1] + [k]

    print("Activity Shapes:")
    for a in sorted(list(df_master[k].unique())):
        df = df_master.loc[df_master[k] == a]
        print(a, df.shape)

    print("DataFrame Shape:", df_master.shape)

    return df_master

# url = "/content/drive/MyDrive/Classes/CSCE 5280 AI for Wearables/Group Project 2/Datasets/HAPT_UC_k_means/"
# df = load_files(url, view=False)

# View Activity and Total Shapes per Dataset

In [6]:
# Gate for the dataset preview cells below: each one calls load_files only when this is True.
load = True

## HAPT

In [None]:
url = "/content/drive/MyDrive/Classes/CSCE 5280 AI for Wearables/Group Project 2/Datasets/HAPT_UC_k_means/"
if load:
    df = load_files(url, view=False)
    # Total count of missing values across the whole frame.
    print(df.isnull().values.ravel().sum())
    # A bare `df` here would not be rendered: Jupyter only echoes a cell's
    # last top-level expression, not one nested in an `if` body.
    # Call display(df.head()) if a preview is wanted.

## UCI

In [7]:
url = "/content/drive/MyDrive/Classes/CSCE 5280 AI for Wearables/Group Project 2/Datasets/UCI_UC_k_means/"
if load:
    df = load_files(url, view=False)
    # Total count of missing values across the whole frame.
    print(df.isnull().values.ravel().sum())
    # A bare `df` here would not be rendered: Jupyter only echoes a cell's
    # last top-level expression, not one nested in an `if` body.
    # Call display(df.head()) if a preview is wanted.

Activity Shapes:
1 (3444, 562)
2 (3088, 562)
3 (2812, 562)
4 (3554, 562)
5 (3812, 562)
6 (3888, 562)
DataFrame Shape: (20598, 562)
0


## unimib

In [None]:
url = "/content/drive/MyDrive/Classes/CSCE 5280 AI for Wearables/Group Project 2/Datasets/unimib_UC_k_means/"
if load:
    df = load_files(url, view=False)
    # Total count of missing values across the whole frame.
    print(df.isnull().values.ravel().sum())
    # A bare `df` here would not be rendered: Jupyter only echoes a cell's
    # last top-level expression, not one nested in an `if` body.
    # Call display(df.head()) if a preview is wanted.

## wHAR

In [None]:
url = "/content/drive/MyDrive/Classes/CSCE 5280 AI for Wearables/Group Project 2/Datasets/wHAR_UC_k_means/"
if load:
    df = load_files(url, view=False)
    # Total count of missing values across the whole frame.
    print(df.isnull().values.ravel().sum())
    # A bare `df` here would not be rendered: Jupyter only echoes a cell's
    # last top-level expression, not one nested in an `if` body.
    # Call display(df.head()) if a preview is wanted.

## WISDM

In [None]:
# Re-assigns the same flag already set before the HAPT section; it gates the WISDM preview cell below.
load = True

In [None]:
# Scratch arithmetic; what 190 and 5 refer to is not stated anywhere in this notebook -- TODO confirm or remove.
190/5

38.0

In [None]:
url = "/content/drive/MyDrive/Classes/CSCE 5280 AI for Wearables/Group Project 2/Datasets/WISDM_UC_k_means/"
if load:
    df = load_files(url, view=False)
    # Total count of missing values across the whole frame.
    print(df.isnull().values.ravel().sum())
    # A bare `df` here would not be rendered: Jupyter only echoes a cell's
    # last top-level expression, not one nested in an `if` body.
    # Call display(df.head()) if a preview is wanted.

Activity Shapes:
1 (456, 406)
2 (1557, 406)
3 (571, 406)
4 (1728, 406)
5 (264, 406)
6 (198, 406)
DataFrame Shape: (4774, 406)
0


# Preprocess Data

Choose the UCI dataset as the source due to size.

Choose the WISDM dataset as the target (TODO: document the reason for this choice).

## Global Variables

Note: train/val/test split is 80/10/10

In [None]:
# File handling: both dataset folders live under one shared root directory.
data_dir = "/content/drive/MyDrive/Classes/CSCE 5280 AI for Wearables/Group Project 2/Datasets/"    # Mica
# Source domain (UCI) and target domain (WISDM); each path keeps its trailing slash.
source_dir = "%sUCI_UC_k_means/" % data_dir
target_dir = "%sWISDM_UC_k_means/" % data_dir

## Load Data

In [None]:
# Source-domain frame first, then target-domain frame; each call prints its own shape summary.
df_s = load_files(url=source_dir, view=False)
df_t = load_files(url=target_dir, view=False)

Activity Shapes:
1 (3444, 562)
2 (3088, 562)
3 (2812, 562)
4 (3554, 562)
5 (3812, 562)
6 (3888, 562)
DataFrame Shape: (20598, 562)
Activity Shapes:
1 (456, 406)
2 (1557, 406)
3 (571, 406)
4 (1728, 406)
5 (264, 406)
6 (198, 406)
DataFrame Shape: (4774, 406)


## Preprocess

In [None]:
# Image shape for the target rows: 27 * 15 == 405 feature columns.
t_img_rows, t_img_cols = 27, 15
# Print every divisor of 405 with its cofactor to show the candidate (rows, cols) pairs.
for i in filter(lambda k: 405 % k == 0, range(1, 406)):
    print(i, 405/i)

1 405.0
3 135.0
5 81.0
9 45.0
15 27.0
27 15.0
45 9.0
81 5.0
135 3.0
405 1.0


In [None]:
# Image shape for the source rows: 33 * 17 == 561 feature columns.
s_img_rows, s_img_cols = 33, 17
# Print every divisor of 561 with its cofactor to show the candidate (rows, cols) pairs.
for i in filter(lambda k: 561 % k == 0, range(1, 562)):
    print(i, 561/i)

1 561.0
3 187.0
11 51.0
17 33.0
33 17.0
51 11.0
187 3.0
561 1.0


In [None]:
# Split data/label
def split(df, img_rows, img_cols):
    """Split a labelled frame into train/val/test and add image-shaped copies.

    The ``"label"`` column becomes ``y``; all remaining columns become ``x``.
    20% of the rows are held out with a fixed seed, and that hold-out is then
    halved into validation and test, giving an 80/10/10 split.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and a ``"label"`` column.
    img_rows, img_cols : int
        Target 2-D shape for each row of features; their product must equal
        the number of feature columns.

    Returns
    -------
    dict
        ``{"train": ..., "val": ..., "test": ...}``; each value is a dict with
        ``"x"`` (feature DataFrame), ``"x_shaped"`` (NumPy array of shape
        ``(rows, img_rows, img_cols)``) and ``"y"`` (single-column label
        DataFrame).
    """
    # Separate features from the label column.
    feature_cols = list(df.columns)
    feature_cols.remove("label")
    features = df[feature_cols]
    targets = df[["label"]]

    # First carve off 20%, then split that hold-out evenly into val and test.
    train_x, held_x, train_y, held_y = train_test_split(
        features, targets, test_size=0.2, random_state=42
    )
    val_x, test_x, val_y, test_y = train_test_split(
        held_x, held_y, test_size=0.5, random_state=42
    )

    partitions = (
        ("train", train_x, train_y),
        ("val", val_x, val_y),
        ("test", test_x, test_y),
    )

    # Each partition keeps the tabular features alongside an independent
    # array copy reshaped to (n_rows, img_rows, img_cols).
    return {
        name: {
            "x": part_x,
            "x_shaped": part_x.to_numpy(copy=True).reshape(
                (part_x.shape[0], img_rows, img_cols)
            ),
            "y": part_y,
        }
        for name, part_x, part_y in partitions
    }

# Build the train/val/test dictionaries, each with its own image shape.
d_s = split(df_s, img_rows=s_img_rows, img_cols=s_img_cols)
d_t = split(df_t, img_rows=t_img_rows, img_cols=t_img_cols)

In [None]:
# View
def view(d):
    """Print row and label distributions for a dictionary of data splits.

    Three sections are printed: the share of rows in each split, the label
    shares inside each split, and how each label's rows are divided across
    the splits.

    Parameters
    ----------
    d : dict
        Maps a split name to a dict holding ``"x"`` (any object with a
        ``.shape``) and ``"y"`` (a DataFrame with a ``"label"`` column).
    """
    # Train/val/test % check
    print("Train/Val/Test Distribution")
    row_counts = {s: d[s]["x"].shape[0] for s in d}
    total_rows = sum(row_counts.values())
    for s in d:
        # Guard the division so all-empty splits report 0.0 instead of raising.
        share = row_counts[s] / total_rows if total_rows else 0.0
        print("Set:", s, "\tRow Count:", row_counts[s], "\tPercent:", share)
    print()

    labels_by_set = {s: d[s]["y"]["label"] for s in d}

    def count(s, label):
        # Number of rows in split `s` carrying `label`, as a plain int.
        return int((labels_by_set[s] == label).sum())

    # Label distribution within each set
    print("Label Distribution Within a Set")
    for s in d:
        present = sorted(list(labels_by_set[s].unique()))
        counts = {label: count(s, label) for label in present}
        set_total = sum(counts.values())
        for label in present:
            print("Set:", s, "\tLabel:", label, "\tRow Count:", counts[label], "\tPercent", counts[label] / set_total)
        print()

    # Label distribution between sets
    print("Label Distribution Between Sets")
    # Use the labels of every split, not only "test": a rare label that is
    # missing from the test split must still be reported.
    all_labels = sorted(set().union(*(labels_by_set[s].unique() for s in d)))
    for label in all_labels:
        counts = {s: count(s, label) for s in d}
        label_total = sum(counts.values())
        for s in d:
            print("Label:", label, "\tSet:", s, "\tRow Count:", counts[s], "\tPercent", counts[s] / label_total)
        print()

In [None]:
# Print split sizes and label distributions for the source splits.
view(d_s)

Train/Val/Test Distribution
Set: train 	Row Count: 16478 	Percent: 0.799980580638897
Set: val 	Row Count: 2060 	Percent: 0.1000097096805515
Set: test 	Row Count: 2060 	Percent: 0.1000097096805515

Label Distribution Within a Set
Set: train 	Label: 1 	Row Count: 2760 	Percent 0.16749605534652265
Set: train 	Label: 2 	Row Count: 2455 	Percent 0.1489865274912004
Set: train 	Label: 3 	Row Count: 2254 	Percent 0.13678844519966016
Set: train 	Label: 4 	Row Count: 2826 	Percent 0.17150139580046123
Set: train 	Label: 5 	Row Count: 3057 	Percent 0.18552008738924627
Set: train 	Label: 6 	Row Count: 3126 	Percent 0.18970748877290933

Set: val 	Label: 1 	Row Count: 322 	Percent 0.1563106796116505
Set: val 	Label: 2 	Row Count: 324 	Percent 0.15728155339805824
Set: val 	Label: 3 	Row Count: 284 	Percent 0.1378640776699029
Set: val 	Label: 4 	Row Count: 371 	Percent 0.18009708737864077
Set: val 	Label: 5 	Row Count: 388 	Percent 0.1883495145631068
Set: val 	Label: 6 	Row Count: 371 	Percent 0.180097

In [None]:
# Print split sizes and label distributions for the target splits.
view(d_t)

Train/Val/Test Distribution
Set: train 	Row Count: 3819 	Percent: 0.7999581064097193
Set: val 	Row Count: 477 	Percent: 0.09991621281943863
Set: test 	Row Count: 478 	Percent: 0.10012568077084207

Label Distribution Within a Set
Set: train 	Label: 1 	Row Count: 362 	Percent 0.09478921183555905
Set: train 	Label: 2 	Row Count: 1220 	Percent 0.31945535480492276
Set: train 	Label: 3 	Row Count: 455 	Percent 0.11914113642314741
Set: train 	Label: 4 	Row Count: 1412 	Percent 0.36973029588897616
Set: train 	Label: 5 	Row Count: 212 	Percent 0.05551191411364231
Set: train 	Label: 6 	Row Count: 158 	Percent 0.04137208693375229

Set: val 	Label: 1 	Row Count: 48 	Percent 0.10062893081761007
Set: val 	Label: 2 	Row Count: 171 	Percent 0.3584905660377358
Set: val 	Label: 3 	Row Count: 59 	Percent 0.12368972746331237
Set: val 	Label: 4 	Row Count: 164 	Percent 0.3438155136268344
Set: val 	Label: 5 	Row Count: 22 	Percent 0.04612159329140461
Set: val 	Label: 6 	Row Count: 13 	Percent 0.027253668763

# To File

In [None]:
# Write to csv
def to_csv_file(d, k, target_dir):
    """Write every entry of a split dictionary into a ``*_preprocessed`` folder.

    The output folder is ``target_dir`` (without any trailing ``/``) plus
    ``"_preprocessed/"``; it is created if missing. Files are named
    ``<k>_<split>_<entry>``. The ``"x_shaped"`` entry is pickled to
    ``.pickle``; every other entry is written with its ``to_csv`` method
    (header kept, index dropped). Each split and entry name is printed as it
    is processed.

    Parameters
    ----------
    d : dict
        Maps split name to a dict of entries, as returned by ``split``.
    k : str
        File-name prefix (e.g. ``"source"`` or ``"target"``).
    target_dir : str
        Dataset directory, with or without a trailing slash.
    """
    # Strip the trailing separator rather than slicing off the last
    # character, which would corrupt a path given without a trailing "/".
    out_dir = target_dir.rstrip("/") + "_preprocessed/"
    # Create the folder once up front; exist_ok makes re-runs safe.
    os.makedirs(out_dir, exist_ok=True)

    for s in d:
        print(s)
        for xy in d[s]:
            print("\t", xy)
            f_name = out_dir + k + "_" + s + "_" + xy
            if xy != "x_shaped":
                d[s][xy].to_csv(f_name + ".csv", header=True, index=False)
            else:
                # 3-D arrays do not fit a CSV table, so they are pickled.
                with open(f_name + ".pickle", "wb") as file:
                    pickle.dump(d[s][xy], file)

# Persist both split dictionaries next to their dataset folders, source first.
to_csv_file(d_s, k="source", target_dir=source_dir)
to_csv_file(d_t, k="target", target_dir=target_dir)

train
	 x
	 x_shaped
	 y
val
	 x
	 x_shaped
	 y
test
	 x
	 x_shaped
	 y
train
	 x
	 x_shaped
	 y
val
	 x
	 x_shaped
	 y
test
	 x
	 x_shaped
	 y
