# Data Splitting
This notebook shows how to read the csv data into pd.DataFrames

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

## Read CSV data  
Shows how to read different csv data into one pandas Dataframe with each row as data measure with information about the participant.shuffle

In [2]:
def get_ds_infos():
    """
    Read the file includes data subject information.
    
    Data Columns:
    0: code [1-24]
    1: weight [kg]
    2: height [cm]
    3: age [years]
    4: gender [0:Female, 1:Male]
    
    Returns:
        A pandas DataFrame that contains inforamtion about data subjects' attributes 
    """ 

    dss = pd.read_csv(os.path.join(motion_sense_dir, "data_subjects_info.csv"))
    print("[INFO] -- Data subjects' information is imported.")
    
    return dss

def set_data_types(data_types=["userAcceleration"]):
    """
    Select the sensors and the mode to shape the final dataset.
    
    Args:
        data_types: A list of sensor data type from this list: [attitude, gravity, rotationRate, userAcceleration] 

    Returns:
        It returns a list of columns to use for creating time-series from files.
    """
    dt_list = []
    for t in data_types:
        if t != "attitude":
            dt_list.append([t+".x",t+".y",t+".z"])
        else:
            dt_list.append([t+".roll", t+".pitch", t+".yaw"])

    return dt_list


def creat_time_series(dt_list, act_labels, trial_codes, mode="mag", labeled=True):
    """
    Args:
        dt_list: A list of columns that shows the type of data we want.
        act_labels: list of activites
        trial_codes: list of trials
        mode: It can be "raw" which means you want raw data
        for every dimention of each data type,
        [attitude(roll, pitch, yaw); gravity(x, y, z); rotationRate(x, y, z); userAcceleration(x,y,z)].
        or it can be "mag" which means you only want the magnitude for each data type: (x^2+y^2+z^2)^(1/2)
        labeled: True, if we want a labeld dataset. False, if we only want sensor values.

    Returns:
        It returns a time-series of sensor data.
    
    """
    num_data_cols = len(dt_list) if mode == "mag" else len(dt_list*3)

    if labeled:
        dataset = np.zeros((0,num_data_cols+7)) # "7" --> [act, code, weight, height, age, gender, trial] 
    else:
        dataset = np.zeros((0,num_data_cols))
        
    ds_list = get_ds_infos()
    
    print("[INFO] -- Creating Time-Series")
    for sub_id in ds_list["code"]:
        for act_id, act in enumerate(act_labels):
            for trial in trial_codes[act_id]:
                fname = os.path.join(motion_sense_dir, 'A_DeviceMotion_data/'+act+'_'+str(trial)+'/sub_'+str(int(sub_id))+'.csv')
                raw_data = pd.read_csv(fname)
                raw_data = raw_data.drop(['Unnamed: 0'], axis=1)
                vals = np.zeros((len(raw_data), num_data_cols))
                for x_id, axes in enumerate(dt_list):
                    if mode == "mag":
                        vals[:,x_id] = (raw_data[axes]**2).sum(axis=1)**0.5        
                    else:
                        vals[:,x_id*3:(x_id+1)*3] = raw_data[axes].values
                    vals = vals[:,:num_data_cols]
                if labeled:
                    lbls = np.array([[act_id,
                            sub_id-1,
                            ds_list["weight"][sub_id-1],
                            ds_list["height"][sub_id-1],
                            ds_list["age"][sub_id-1],
                            ds_list["gender"][sub_id-1],
                            trial          
                           ]]*len(raw_data))
                    vals = np.concatenate((vals, lbls), axis=1)
                dataset = np.append(dataset,vals, axis=0)
    cols = []
    for axes in dt_list:
        if mode == "raw":
            cols += axes
        else:
            cols += [str(axes[0][:-2])]
            
    if labeled:
        cols += ["act", "id", "weight", "height", "age", "gender", "trial"]
    
    dataset = pd.DataFrame(data=dataset, columns=cols)
    return dataset
#________________________________


ACT_LABELS = ["dws","ups", "wlk", "jog", "std", "sit"]
TRIAL_CODES = {
    ACT_LABELS[0]:[1,2,11],
    ACT_LABELS[1]:[3,4,12],
    ACT_LABELS[2]:[7,8,15],
    ACT_LABELS[3]:[9,16],
    ACT_LABELS[4]:[6,14],
    ACT_LABELS[5]:[5,13]
}

### Create DataFrame

In [3]:
## Here we set parameter to build labeld time-series from dataset of "(A)DeviceMotion_data"
## attitude(roll, pitch, yaw); gravity(x, y, z); rotationRate(x, y, z); userAcceleration(x,y,z)
motion_sense_dir = "../raw/motionsense-dataset"
sdt = ["userAcceleration"]
print("[INFO] -- Selected sensor data types: "+str(sdt))    
act_labels = ['std', 'wlk', 'jog']#ACT_LABELS [0:4]
print("[INFO] -- Selected activites: "+str(act_labels))    
trial_codes = [TRIAL_CODES[act] for act in act_labels]
dt_list = set_data_types(sdt)
dataset = creat_time_series(dt_list, act_labels, trial_codes, mode="raw", labeled=True)
print("[INFO] -- Shape of time-Series dataset:"+str(dataset.shape))    
dataset

[INFO] -- Selected sensor data types: ['userAcceleration']
[INFO] -- Selected activites: ['std', 'wlk', 'jog']
[INFO] -- Data subjects' information is imported.
[INFO] -- Creating Time-Series
[INFO] -- Shape of time-Series dataset:(784946, 10)


Unnamed: 0,userAcceleration.x,userAcceleration.y,userAcceleration.z,act,id,weight,height,age,gender,trial
0,0.007192,-0.004249,0.065989,0.0,0.0,102.0,188.0,46.0,1.0,6.0
1,-0.082061,0.004820,0.022905,0.0,0.0,102.0,188.0,46.0,1.0,6.0
2,-0.101612,0.008953,0.002664,0.0,0.0,102.0,188.0,46.0,1.0,6.0
3,-0.113307,0.020615,-0.029093,0.0,0.0,102.0,188.0,46.0,1.0,6.0
4,-0.111187,0.026429,-0.034393,0.0,0.0,102.0,188.0,46.0,1.0,6.0
...,...,...,...,...,...,...,...,...,...,...
784941,-0.090358,0.364632,1.418838,2.0,23.0,74.0,173.0,18.0,0.0,16.0
784942,0.775085,-0.704872,-1.384102,2.0,23.0,74.0,173.0,18.0,0.0,16.0
784943,0.862655,0.054028,-1.188137,2.0,23.0,74.0,173.0,18.0,0.0,16.0
784944,0.660700,0.977416,-1.382904,2.0,23.0,74.0,173.0,18.0,0.0,16.0


In [8]:
dataset.min()

userAcceleration.x     -6.369264
userAcceleration.y     -5.673592
userAcceleration.z     -7.743481
act                     0.000000
id                      0.000000
weight                 48.000000
height                161.000000
age                    18.000000
gender                  0.000000
trial                   6.000000
dtype: float64

## Aggregate DataFrame
At the moment one measure timestep is represented as row.
Further cells shows you how this above representation is transformed into a representation where one row a trial. Measured data are stored as list. Further demographic data except the sex are thrown away.

In [4]:
## this method calculates x, y, z into one value
def calc_consultant(df, xyz=['userAcceleration.x', 'userAcceleration.y', 'userAcceleration.z']):
    cs = []
    for i, row in df.iterrows():
        x = np.array(row[xyz[0]])
        y = np.array(row[xyz[1]])
        z = np.array(row[xyz[2]])
        c = np.square(np.power(x, 2) + np.power(y, 2) + np.power(z, 2))
        cs.append(c)
    return cs

In [5]:
dataset_agg = dataset.groupby(['id', 'act', 'trial']).agg({
    'userAcceleration.x': list,
    'userAcceleration.y': list,
    'userAcceleration.z': list,
    'gender': max
}).reset_index()

dataset_agg['userAcceleration.c'] = calc_consultant(dataset_agg)
# reorder columns 
dataset_agg = dataset_agg[['id', 'act', 'trial', 'gender',  'userAcceleration.x', 'userAcceleration.y', 'userAcceleration.z', 'userAcceleration.c']]
dataset_agg

Unnamed: 0,id,act,trial,gender,userAcceleration.x,userAcceleration.y,userAcceleration.z,userAcceleration.c
0,0.0,0.0,6.0,1.0,"[0.007192000000000001, -0.082061, -0.101612, -...","[-0.004249, 0.0048200000000000005, 0.008953000...","[0.06598899999999999, 0.022905, 0.002664, -0.0...","[1.9574669279047835e-05, 5.302576389694968e-05..."
1,0.0,0.0,14.0,1.0,"[0.019346, -0.069464, -0.073061, -0.037023, -0...","[0.030886, 0.063764, 0.046253, -0.020236, -0.0...","[0.008607, -0.02108, -0.026514, 0.048161, 0.04...","[1.966426109387372e-06, 8.715083940152258e-05,..."
2,0.0,1.0,7.0,1.0,"[0.091702, 0.367412, 0.172093, 0.004957, -0.04...","[0.4158810000000001, 0.004566, -0.217483, -0.2...","[0.093689, -0.106082, -0.16338699999999998, -0...","[0.03615469954549734, 0.021393689171191922, 0...."
3,0.0,1.0,8.0,1.0,"[0.008911, -0.138467, -0.411792, -0.399219, -0...","[0.811516, 0.779104, 0.657027, 0.419755, -0.01...","[0.390558, 0.459506, 0.080446, 0.1086099999999...","[0.65800192071727, 0.7011079925897511, 0.36933..."
4,0.0,1.0,15.0,1.0,"[-0.534569, 0.02255, 0.522461, 0.556612, 0.394...","[-0.158566, 0.026424, 0.306722, 0.253333, -0.1...","[0.918927, 0.394119, -0.095133, -0.207446, 0.0...","[1.3347967057319228, 0.024503680978231166, 0.1..."
...,...,...,...,...,...,...,...,...
163,23.0,1.0,7.0,0.0,"[-0.0049770000000000005, -0.105378, -0.239922,...","[-0.018604, 0.050151, 0.135729, 0.045206, 0.04...","[-0.353208, -0.314363, -0.348385, -0.141423, -...","[0.01565670872303731, 0.012643594992173999, 0...."
164,23.0,1.0,8.0,0.0,"[-1.425842, 0.114683, 0.39819, 0.698681, -0.06...","[1.681821, -0.123146, -0.219326, 0.377987, 1.4...","[0.544728, -0.177449, -0.279697, -0.318708, -1...","[26.607810043759745, 0.0035766709663875088, 0...."
165,23.0,1.0,15.0,0.0,"[-1.452646, -1.252442, -0.32668, 0.384259, 0.9...","[0.487609, 0.5197470000000001, 0.27915, 0.7597...","[0.243056, 0.316493, 0.188983, -0.110478, -0.5...","[5.793741223768068, 3.759394194166553, 0.04855..."
166,23.0,2.0,9.0,0.0,"[0.091297, -0.264581, -0.221304, -1.643414, -1...","[-0.535709, -0.124192, 0.272271, 3.05774900000...","[0.6634329999999999, -0.172721, -0.467133, 0.3...","[0.5409052659747449, 0.013284706758845154, 0.1..."


In [6]:
dataset_agg.to_hdf('../mydata/motionsense_aggregated.h5', key='agg_df')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_values] [items->Index(['userAcceleration.x', 'userAcceleration.y', 'userAcceleration.z',
       'userAcceleration.c'],
      dtype='object')]

  pytables.to_hdf(


## Split DataFrame
Split DataFrame into Training-, Validation- and Testsets.  
The methods tries to balance the datasets regarding columns: gender, trial_length and act.

In [40]:
def dataframe_split(df, test_size, shuffle=False, random_state=None):
    train_dfs = []
    test_dfs = []
    
    for gender in df.gender.unique():
        gender_df = df[df['gender'] == gender]
        gender_df['trial_length'] = gender_df.trial.map(lambda trial: 'short' if (trial > 9) else 'long')
        # gender_dfs.append(gender_df)
        #gender_df = shuffle(gender_df, random_state=random_state)

        for trial_length in gender_df.trial_length.unique():
            trial_length_df = gender_df[gender_df['trial_length'] == trial_length]
            #trial_length_df = shuffle(trial_length_df, random_state=random_state)
            for act in trial_length_df.act.unique():
                act_df = trial_length_df[trial_length_df['act'] == act]
                #act_df = shuffle(act_df, random_state=random_state)
                train_df, test_df = train_test_split(act_df, test_size=test_size, shuffle=shuffle, random_state=random_state)

                train_dfs.append(train_df)
                test_dfs.append(test_df)
    return pd.concat(train_dfs).reset_index(drop=True), pd.concat(test_dfs).reset_index(drop=True)

In [41]:
dataset_agg = pd.read_hdf('../mydata/motionsense_aggregated.h5')

### splitting

In [52]:
train_df, val_test_df = dataframe_split(dataset_agg, test_size=0.3) 
val_df, test_df = dataframe_split(val_test_df, test_size=0.4)
target_path = '../mydata'

print('Save Train DataFrame.')
train_df.to_hdf(os.path.join(target_path, 'train_df.h5'), key='train')

print('Save Validation DataFrame.')
val_df.to_hdf(os.path.join(target_path, 'val_df.h5'), key='val')

print('Save Test DataFrame.')
test_df.to_hdf(os.path.join(target_path, 'test_df.h5'), key='test')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


Save Train DataFrame.
Save Validation DataFrame.
Save Test DataFrame.


## check results

In [53]:
print('Train:', len(train_df))
print('Val:', len(val_df))
print('Test:', len(test_df))

Train: 113
Val: 28
Test: 27


In [54]:
print(train_df.act.value_counts())
print(val_df.act.value_counts())
print(test_df.act.value_counts())

1.0    49
2.0    32
0.0    32
Name: act, dtype: int64
1.0    12
2.0     8
0.0     8
Name: act, dtype: int64
1.0    11
2.0     8
0.0     8
Name: act, dtype: int64


In [55]:
print(train_df.trial.value_counts())
print(val_df.trial.value_counts())
print(test_df.trial.value_counts())

7.0     17
16.0    16
15.0    16
14.0    16
9.0     16
8.0     16
6.0     16
Name: trial, dtype: int64
16.0    4
15.0    4
14.0    4
9.0     4
7.0     4
8.0     4
6.0     4
Name: trial, dtype: int64
16.0    4
15.0    4
14.0    4
9.0     4
8.0     4
6.0     4
7.0     3
Name: trial, dtype: int64


In [58]:
train_df

Unnamed: 0,id,act,trial,gender,userAcceleration.x,userAcceleration.y,userAcceleration.z,userAcceleration.c,trial_length
0,0.0,0.0,6.0,1.0,"[0.007192000000000001, -0.082061, -0.101612, -...","[-0.004249, 0.0048200000000000005, 0.008953000...","[0.06598899999999999, 0.022905, 0.002664, -0.0...","[1.9574669279047835e-05, 5.302576389694968e-05...",long
1,1.0,0.0,6.0,1.0,"[-0.050953, -0.015589, -0.005096, 0.001417, 0....","[-0.01596, -0.01192, -0.012113, -0.021758, -0....","[0.056272, 0.04063, 0.03825, 0.024487, 0.00585...","[3.620991863979229e-05, 4.144889709867849e-06,...",long
2,3.0,0.0,6.0,1.0,"[-0.021865, -0.019824, -0.022721, -0.02161, -0...","[0.005535, 0.008296, 0.005762, 0.0064040000000...","[-0.003022, 0.002832, 0.001659, 0.005607, 0.00...","[2.681654470532004e-07, 2.2074475432575393e-07...",long
3,5.0,0.0,6.0,1.0,"[0.000742, -0.00287, -0.004672, 0.002813, 0.00...","[0.003216, -2.5e-05, 0.007909000000000001, 0.0...","[0.00229, 0.012204, 0.006470999999999999, 0.00...","[2.604130967824e-10, 2.470402494836988e-08, 1....",long
4,8.0,0.0,6.0,1.0,"[0.006305, -0.010752, -0.014288999999999996, -...","[0.011208, 0.005011, -0.006165, 0.003332, 0.02...","[-0.036472, -0.033296, -0.017369, -0.016557, -...","[2.236756763595539e-06, 1.5608485391024556e-06...",long
...,...,...,...,...,...,...,...,...,...
108,6.0,2.0,16.0,0.0,"[-0.038013, -0.39413, -0.482694, -0.441861, -0...","[-1.7199380000000002, -1.803184, -1.671355, -1...","[0.534516, 0.476559, 0.364118, -0.027996, -0.2...","[10.53222565505175, 13.20537075250675, 9.97929...",short
109,7.0,2.0,16.0,0.0,"[0.013999999999999999, -0.0061920000000000005,...","[-0.14188299999999998, -0.14288199999999998, -...","[-0.032978, -0.057586, -0.078973, -0.106102, -...","[0.0004585737080729153, 0.0005650012139677852,...",short
110,9.0,2.0,16.0,0.0,"[-0.8400489999999999, -0.235474, -0.522702, 0....","[-0.734053, 0.336275, 1.260167, 2.526999, 1.00...","[0.163569, -1.948608, 0.8250940000000001, 1.49...","[1.616129820612543, 15.725999364888075, 6.4618...",short
111,15.0,2.0,16.0,0.0,"[-0.467345, -0.580374, -0.618786, -0.536335999...","[-1.027142, -1.385261, -1.079246, -1.069238, -...","[0.5473060000000001, 0.813357, 0.875975, 0.786...","[2.47425316570051, 8.510823824496729, 5.359226...",short


In [17]:
dataset.columns[:3]

Index(['userAcceleration.x', 'userAcceleration.y', 'userAcceleration.z'], dtype='object')

In [20]:
for col in dataset.columns[:3]:
    col_vals = dataset[col]
    
    print('min: ', col_vals.min(), 'max: ', col_vals.max())

min:  -6.369264 max:  7.120792999999999
min:  -5.673592 max:  7.322722
min:  -7.743481 max:  8.125357000000001


In [15]:
dataset

Unnamed: 0,userAcceleration.x,userAcceleration.y,userAcceleration.z,act,id,weight,height,age,gender,trial
0,0.007192,-0.004249,0.065989,0.0,0.0,102.0,188.0,46.0,1.0,6.0
1,-0.082061,0.004820,0.022905,0.0,0.0,102.0,188.0,46.0,1.0,6.0
2,-0.101612,0.008953,0.002664,0.0,0.0,102.0,188.0,46.0,1.0,6.0
3,-0.113307,0.020615,-0.029093,0.0,0.0,102.0,188.0,46.0,1.0,6.0
4,-0.111187,0.026429,-0.034393,0.0,0.0,102.0,188.0,46.0,1.0,6.0
...,...,...,...,...,...,...,...,...,...,...
784941,-0.090358,0.364632,1.418838,2.0,23.0,74.0,173.0,18.0,0.0,16.0
784942,0.775085,-0.704872,-1.384102,2.0,23.0,74.0,173.0,18.0,0.0,16.0
784943,0.862655,0.054028,-1.188137,2.0,23.0,74.0,173.0,18.0,0.0,16.0
784944,0.660700,0.977416,-1.382904,2.0,23.0,74.0,173.0,18.0,0.0,16.0
