# Data understanding and preprocessing

In [10]:
'''
Import cell :D
''' 

import pandas as pd
from collections import Counter 
from collections import defaultdict
import random
import math
import matplotlib.pyplot as plt
import numpy as np
import os

# test your pandas with this
# pd.test()

In [11]:
'''
Get the train and test data from the csv files
'''

# load the train features and labels (file names are relative to the
# current working directory)
train_feat = pd.read_csv("train_feat.csv")
train_label = pd.read_csv("train_label.csv")

# load the test features and labels the same way
test_feat = pd.read_csv("test_feat.csv")
test_label = pd.read_csv("test_label.csv")

In [12]:
'''
Test cell for pandas understanding (1/3)
''' 

# select the first 3 rows by position; as the last expression of the cell
# the result is displayed by the notebook (no explicit print needed)
train_feat.iloc[:3]

Unnamed: 0,Time,HR,BR,SkinTemp,Posture,Activity,PeakAccel,BRAmplitude,BRNoise,BRConfidence,...,ROGTime,VerticalMin,VerticalPeak,LateralMin,LateralPeak,SagittalMin,SagittalPeak,AuxADC1,AuxADC2,AuxADC3
0,1358759280432,101,18.6,-3276.8,12,0.58,0.95,222,65535,255,...,61,-1.2,-0.51,-0.15,0.72,-0.53,0.31,427,441,515
1,1358759281432,101,17.0,-3276.8,15,0.46,1.02,221,65535,255,...,62,-1.25,-0.39,-0.24,0.77,-0.71,-0.1,430,444,516
2,1358759282432,101,17.0,-3276.8,10,0.19,0.42,253,65535,255,...,63,-1.18,-0.83,-0.18,0.24,-0.31,-0.01,431,444,517


In [13]:
'''
Test cell for pandas understanding (2/3)
''' 

# select the HR column via attribute access (same column as train_feat['HR']);
# the notebook displays it because it is the last expression of the cell
train_feat.HR

0        101
1        101
2        101
3        102
4        103
5        104
6        105
7        106
8        107
9        106
10       107
11       108
12       108
13       109
14       109
15       109
16       109
17       106
18       103
19        99
20        98
21        97
22        96
23        97
24        95
25        93
26        92
27        89
28        89
29        91
        ... 
15990    119
15991    119
15992    120
15993    123
15994    124
15995    126
15996    127
15997    129
15998    130
15999    131
16000    132
16001    132
16002    133
16003    134
16004    135
16005    136
16006    136
16007    137
16008    138
16009    139
16010    140
16011    140
16012    140
16013    140
16014    139
16015    139
16016    139
16017    139
16018    138
16019    137
Name: HR, Length: 16020, dtype: int64

In [14]:
'''
Test cell for pandas understanding (3/3)
''' 

# iterating a DataFrame yields its column names, so list() gives the
# column names in order
list(train_feat)

['Time',
 'HR',
 'BR',
 'SkinTemp',
 'Posture',
 'Activity',
 'PeakAccel',
 'BRAmplitude',
 'BRNoise',
 'BRConfidence',
 'ECGAmplitude',
 'ECGNoise',
 'HRConfidence',
 'HRV',
 'GSR',
 'ROGState',
 'ROGTime',
 'VerticalMin',
 'VerticalPeak',
 'LateralMin',
 'LateralPeak',
 'SagittalMin',
 'SagittalPeak',
 'AuxADC1',
 'AuxADC2',
 'AuxADC3']

In [15]:
'''
Get the number of unique values of the metrics/columns

@param  pandas DataFrame    The data
@return dict                The number of unique values per metric/column
'''
def unique_vals(data):
    '''
    Count the distinct values held by every metric/column.

    @param  data    The data (pandas DataFrame); iterated by column name
    @return dict    Column name -> number of distinct values in that column
    '''
    # a set keeps one copy of each value, so its size is the distinct count
    return {column: len(set(data[column])) for column in data}

In [16]:
'''
Test cell
'''
from decimal import Decimal

# skip the first 3600 rows and inspect only the remainder
train_set = train_feat.iloc[3600:]

# number of unique values per column, followed by six blank lines
print(unique_vals(train_set))
print("\n" * 5)

# variance per column, largest first, followed by three blank lines
print(train_set.var().sort_values(ascending=False))
print("\n\n")

# the distinct SkinTemp values, followed by three blank lines
print(train_set.SkinTemp.unique())
print("\n\n")

{'Time': 12420, 'HR': 134, 'BR': 328, 'SkinTemp': 1, 'Posture': 137, 'Activity': 96, 'PeakAccel': 173, 'BRAmplitude': 440, 'BRNoise': 1, 'BRConfidence': 1, 'ECGAmplitude': 155, 'ECGNoise': 162, 'HRConfidence': 95, 'HRV': 104, 'GSR': 1, 'ROGState': 4, 'ROGTime': 1919, 'VerticalMin': 233, 'VerticalPeak': 134, 'LateralMin': 132, 'LateralPeak': 108, 'SagittalMin': 209, 'SagittalPeak': 191, 'AuxADC1': 258, 'AuxADC2': 320, 'AuxADC3': 145}






Time            1.688524e+13
HRV             5.376258e+08
ROGTime         1.745854e+05
BRAmplitude     5.449234e+03
AuxADC2         7.185962e+02
HR              4.392957e+02
AuxADC1         3.796610e+02
Posture         3.120764e+02
HRConfidence    1.099046e+02
AuxADC3         9.069297e+01
BR              3.649693e+01
ROGState        5.101821e-01
SagittalMin     1.318267e-01
PeakAccel       9.212248e-02
VerticalMin     7.991195e-02
SagittalPeak    6.376250e-02
VerticalPeak    5.257688e-02
LateralMin      3.604141e-02
LateralPeak     2.382645e-02
Activi

In [None]:
'''
Remove unmeasured metrics and the metrics that are given by the user

@param  pandas DataFrame    The data
@param  array               The list of metrics that need to be removed
@return pandas DataFrame    The cleaned data
'''
def clean_data(d, metrics=None):
    '''
    Drop the requested metrics plus every metric that holds one single value.

    Neither argument is modified: the data is copied and the list of metric
    names is copied before the constant columns are added to it. (Appending
    to the caller's list / to a shared default list made columns found in
    one call leak into the next call.)

    @param  d         pandas DataFrame    The data
    @param  metrics   iterable or None    Names of the metrics to remove
                                          (None means "only constant ones")
    @return pandas DataFrame    A cleaned copy of the data; when one of the
                                names does not exist a message is printed and
                                an unmodified copy is returned instead
    '''
    # create a hard copy of the data
    data = d.copy()

    # private copy of the requested names, never the caller's own list
    to_drop = list(metrics) if metrics is not None else []

    # a column with exactly one distinct value carries no information;
    # dropna=False counts NaN as a value, so an all-NaN column is constant too
    for metric in data:
        if data[metric].nunique(dropna=False) == 1:
            to_drop.append(metric)

    # remove duplicates while keeping the order of first appearance
    unique_drop = list(dict.fromkeys(to_drop))

    try:
        return data.drop(columns=unique_drop)
    except KeyError:
        # only a missing column name is expected here; anything else is a
        # real error and is allowed to propagate
        print("The used list (",to_drop,") contains column names that don't exist in the data.")

        # return the (copied) original data
        return data

In [None]:
'''
Test cell
'''

# drop the five listed metrics (clean_data also drops every single-valued
# column); the result is displayed as the last expression of the cell
train_cleaned = clean_data(train_feat, ['BRAmplitude', 'HRV', 'AuxADC1', 'AuxADC2', 'AuxADC3'])
train_cleaned

In [20]:
'''
Create a csv with the averages of the data (per minute)

@param  pandas DataFrame    The data (!including the Time column!)
@param  string              The name of the target file
'''
def seconds_to_minutes(data, filename):
    '''
    Write a csv holding the column means of every block of 60 rows.

    Each output row is the mean of 60 consecutive input rows (the last block
    may be shorter). The Time value of an output row is not averaged: it is
    the Time of the first row of the block, written as an integer.

    @param  data        pandas DataFrame    The data (!including the Time column!)
    @param  filename    string              The name of the target file
    '''
    # opening with 'w' truncates an existing file, so a separate delete is
    # not needed; the with-block closes the file even when a row fails
    with open(filename, 'w') as f:

        # header line: the column names
        f.write(",".join(list(data)) + "\n")

        # loop over the blocks of 60 rows
        for start in range(0, len(data.index), 60):

            # get the block out of the dataset
            part = data.iloc[start:start + 60]

            # calculate the means of the columns (as strings so the Time
            # string below can be stored next to them)
            mean = part.mean().astype(str)

            # use the timestamp of the first row of the block; read it from
            # the column itself and convert with int() instead of slicing
            # ".0" off a string, which cut real digits for integer rows
            mean["Time"] = str(int(data["Time"].iloc[start]))

            # write the line of values
            f.write(",".join(str(value) for value in mean) + "\n")

In [21]:
'''
Test cell
'''

# write the 60-row averages of the train features to a csv file
seconds_to_minutes(train_feat, "train_feat_average.csv")

# write the 60-row averages of the test features to a csv file
seconds_to_minutes(test_feat, "test_feat_average.csv")

# read the averaged train features back in
train_feat_average = pd.read_csv("train_feat_average.csv")

# read the averaged test features back in
test_feat_average = pd.read_csv("test_feat_average.csv")

In [None]:
'''
Create a csv with the labels copied 60 times (in the order that they started in)

@param  pandas DataFrame    The labels (!including the Time column!)
@param  string              The name of the target file
'''
def minutes_to_seconds(labels, filename):
    '''
    Write a csv in which every label row is repeated 60 times.

    For each input row, 60 rows are written with Time increased by
    0, 1000, 2000, ... 59000 and the same Label. Only the Time and Label
    columns are written, so the header is always "Time,Label" (building it
    from all columns misaligned the file when labels had extra or
    differently ordered columns).

    @param  labels      pandas DataFrame    The labels (!including the Time column!)
    @param  filename    string              The name of the target file
    '''
    # opening with 'w' truncates an existing file; the with-block closes the
    # file even when writing fails half-way
    with open(filename, 'w') as f:

        # header matching the two values written per row
        f.write("Time,Label\n")

        # walk the two columns directly instead of rebuilding a row object
        # with labels.iloc[i] for every written line
        for time, label in zip(labels["Time"], labels["Label"]):
            f.writelines(
                "{},{}\n".format(time + j * 1000, label) for j in range(60)
            )

In [None]:
'''
Test cell
'''

# write the train labels, each repeated 60 times, to a csv file
minutes_to_seconds(train_label, "train_label_seconds.csv")

# write the test labels, each repeated 60 times, to a csv file
minutes_to_seconds(test_label, "test_label_seconds.csv")

# read the expanded train labels back in
train_label_seconds = pd.read_csv("train_label_seconds.csv")

# read the expanded test labels back in
test_label_seconds = pd.read_csv("test_label_seconds.csv")

In [24]:
'''
Get the occurrences of every value per metric

@param  pandas DataFrame       The data
@return defaultdict(Counter)   The occurrences of every value per metric 
'''
def occurences(data):
    '''
    Tally how often each value appears in every metric/column.

    @param  data    The data (pandas DataFrame); iterated by column name
    @return defaultdict(Counter)    Column name -> Counter(value -> count);
                                    a column without values gets no entry
    '''
    tallies = defaultdict(Counter)

    for column in data:
        # count the whole column in one go
        column_tally = Counter(data[column])

        # only touch the result when something was counted, so empty
        # columns do not create a key
        if column_tally:
            tallies[column].update(column_tally)

    return tallies

In [27]:
'''
Test cell
'''
# count how often every value occurs in each column of the train labels
o = occurences(train_label)

# print the counts of the Label column
print(o['Label'])

Counter({'standing': 80, 'snowboarding': 63, 'lift': 49, 'sitting': 48, 'lying': 19, 'towlift': 8})


In [19]:
'''
Get the indices of the labels that match the activity

@param  pandas DataFrame    The labels
@param  string              The activity
@return list                The index labels of the rows that match the activity
'''
def get_indices(labels, activity):
    '''
    Find the rows whose Label equals the given activity.

    @param  labels      pandas DataFrame    The labels (needs a 'Label' column)
    @param  activity    string              The activity
    @return list    The index labels (values of labels.index) of the matching
                    rows, as a plain Python list
    '''
    # boolean mask: True where the row carries the requested activity
    is_match = labels['Label'] == activity

    # keep the matching index labels and hand them back as a list
    return labels.index[is_match].tolist()

In [17]:
'''
Get the data of a specific activity

@param  pandas DataFrame    The data
@param  pandas DataFrame    The labels
@param  string              The activity
@return pandas DataFrame    The data that match the activity
'''
def data_of_activity(data, labels, activity):
    '''
    Get the data rows whose label equals the given activity.

    Rows are matched by position: the n-th label row selects the n-th data
    row. (Index labels were previously passed to .iloc, which only gave the
    right rows when labels had a default 0..n-1 index.)

    @param  data        pandas DataFrame    The data
    @param  labels      pandas DataFrame    The labels (needs a 'Label' column)
                                            presumably row-aligned with data;
                                            verify against the caller
    @param  activity    string              The activity
    @return pandas DataFrame    The data rows that match the activity
    '''
    # boolean mask over the label rows, independent of the labels' index
    is_match = (labels['Label'] == activity).values

    # positions (0-based row numbers) of the matching labels
    positions = np.flatnonzero(is_match)

    # select those rows from the data by position
    return data.iloc[positions]

In [22]:
'''
Test cell
'''

# select the averaged train rows labelled "towlift"; the notebook displays
# the result because it is the last expression of the cell
data_of_activity(train_feat_average, train_label, "towlift")

Unnamed: 0,Time,HR,BR,SkinTemp,Posture,Activity,PeakAccel,BRAmplitude,BRNoise,BRConfidence,...,ROGTime,VerticalMin,VerticalPeak,LateralMin,LateralPeak,SagittalMin,SagittalPeak,AuxADC1,AuxADC2,AuxADC3
238,1358776020432,127.25,27.755,-3276.8,-12.05,0.052667,0.115,164.2,65535.0,255.0,...,78.5,-1.051833,-0.933667,0.0235,0.1585,0.109333,0.250833,430.65,437.333333,512.283333
239,1358776140432,121.65,24.74,-3276.8,-12.25,0.062667,0.127833,166.05,65535.0,255.0,...,198.5,-1.0455,-0.925667,-0.050667,0.093833,0.122333,0.269667,431.7,439.316667,513.25
240,1358776200432,120.55,26.18,-3276.8,-12.4,0.0625,0.1335,124.883333,65535.0,255.0,...,96.0,-1.041333,-0.903167,-0.153667,-0.002167,0.095,0.256333,431.616667,440.2,513.883333
241,1358776260432,130.0,29.306667,-3276.8,-8.766667,0.180167,0.347,158.25,65535.0,255.0,...,17.85,-1.214333,-0.801167,-0.155667,0.2,-0.032,0.3575,436.4,445.1,516.633333
247,1358776800432,128.35,21.69,-3276.8,-11.983333,0.048667,0.1085,120.633333,65535.0,255.0,...,160.5,-1.051667,-0.923833,-0.100833,0.028833,0.137,0.268167,433.133333,440.483333,515.333333
248,1358776860432,127.983333,25.533333,-3276.8,-13.783333,0.055333,0.113,133.516667,65535.0,255.0,...,220.5,-1.043167,-0.911167,-0.128833,0.005167,0.141833,0.287667,432.0,439.583333,514.083333
249,1358776920432,124.783333,23.675,-3276.8,-10.883333,0.064667,0.147667,115.5,65535.0,255.0,...,135.833333,-1.06,-0.918833,-0.0655,0.105833,0.097333,0.255167,432.166667,439.75,513.8
250,1358776980432,142.133333,22.681667,-3276.8,-4.316667,0.21,0.412167,218.95,65535.0,255.0,...,15.216667,-1.289333,-0.755667,-0.170333,0.254833,-0.1515,0.271333,429.75,435.616667,512.55


In [None]:
'''
Use the beginning of the train set as the validation set (first 60 minutes)

@param  pandas DataFrame    The train data
@param  pandas DataFrame    The train labels
@param  boolean             Use the minutes instead of seconds (less data)
@return pandas DataFrame    The train part of the split data
@return pandas DataFrame    The validation part of the split data
@return pandas DataFrame    The train part of the split labels
@return pandas DataFrame    The validation part of the split labels
'''
def simple_split(data, labels, less_data):
    '''
    Split off the leading rows as validation set and keep the rest for training.

    The data is always cut after its first 3600 rows. The labels are cut
    after 60 rows when less_data is truthy and after 3600 rows otherwise.

    @param  data         pandas DataFrame    The train data
    @param  labels       pandas DataFrame    The train labels
    @param  less_data    boolean             Use the minutes instead of seconds
    @return tuple    (train data, validation data, train labels, validation labels)
    '''
    # where to cut the labels depends on the flag
    label_cut = 60 if less_data else 3600

    return (
        data.iloc[3600:],          # train part of the data
        data.iloc[:3600],          # validation part of the data
        labels.iloc[label_cut:],   # train part of the labels
        labels.iloc[:label_cut],   # validation part of the labels
    )

In [None]:
'''
Get the train and validation split of the train data

@param  pandas DataFrame    The train data
@param  pandas DataFrame    The train labels
@param  boolean             Use the minutes instead of seconds (less data)
@param  boolean             Include all labels in the validation set
'''
def train_validation_split(data, labels, less_data, include_all = False):
    '''
    Split the train data and labels and write the four parts to csv files.

    Files written (without the index): train_split_feat.csv,
    validation_split_feat.csv, train_split_label.csv and
    validation_split_label.csv.

    @param  data           pandas DataFrame    The train data
    @param  labels         pandas DataFrame    The train labels
    @param  less_data      boolean             Use the minutes instead of seconds
    @param  include_all    boolean             Include all labels in the validation
                                               set (not implemented: prints a
                                               message and writes nothing)
    '''
    # guard: the "all activities in the validation set" variant does not exist yet
    if include_all:
        print("Not implemented yet.")
        return

    feat_train, feat_validation, label_train, label_validation = simple_split(data, labels, less_data)

    # target file -> frame, written in this order
    outputs = (
        ('train_split_feat.csv', feat_train),
        ('validation_split_feat.csv', feat_validation),
        ('train_split_label.csv', label_train),
        ('validation_split_label.csv', label_validation),
    )
    for target, frame in outputs:
        frame.to_csv(path_or_buf=target, index=False)

In [None]:
'''
Preprocess the data

@param  string    The data type (train, validation, test or online_test)
@param  array     The metrics to remove
@param  boolean   Use the minutes instead of seconds (less data)
@param  boolean   Remove the time or not
'''
def preprocess_data_to_csv(data_type, metrics, less_data, remove_time):
    '''
    Clean one data set and write it to preprocessed_<data_type>_feat.csv.

    Reads <data_type>_feat.csv, cleans it with clean_data, optionally replaces
    it by its 60-row averages (written to and re-read from
    <data_type>_feat_average.csv) and optionally removes the Time column.

    @param  data_type      string     The data type (train, validation, test or online_test)
    @param  metrics        array      The metrics to remove
    @param  less_data      boolean    Use the minutes instead of seconds (less data)
    @param  remove_time    boolean    Remove the time or not
    '''
    # all file names derived from the data type
    source_file = data_type + "_feat.csv"
    averaged_file = data_type + "_feat_average.csv"
    target_file = 'preprocessed_'+ data_type + '_feat.csv'

    # load and clean the data (Time is still kept here)
    frame = clean_data(pd.read_csv(source_file), metrics)

    # replace the data by its averages, going through the csv file
    if less_data:
        seconds_to_minutes(frame, averaged_file)
        frame = pd.read_csv(averaged_file)

    # remove Time from the data, if we don't want to train on that
    if remove_time:
        frame = clean_data(frame, ['Time'])

    # write the result without the index column
    frame.to_csv(path_or_buf=target_file, index=False)

In [None]:
'''
Preprocess all the data and write it to a csv file for the ml models

@param  array      The metrics to remove
@param  boolean    Use the minutes instead of seconds (less data)
@param  boolean    Remove the time or not
'''
def preprocess(metrics, less_data, remove_time = True):
    '''
    Preprocess all the data and write it to csv files for the ml models.

    Reads train_feat.csv / train_label.csv, splits them into a train and a
    validation part (written to csv by train_validation_split) and then
    preprocesses the train split, validation split, test and online_test
    feature files with preprocess_data_to_csv.

    @param  metrics        array      The metrics to remove
    @param  less_data      boolean    Use the minutes instead of seconds (less data)
    @param  remove_time    boolean    Remove the time or not
    '''
    # the train data and labels
    data = pd.read_csv("train_feat.csv")
    labels = pd.read_csv("train_label.csv")

    # when working per second, expand every label to 60 rows
    if not less_data:

        # use the labels loaded above; the notebook-level train_label
        # variable was used here before, which only worked when that
        # cell had been run first
        minutes_to_seconds(labels, "train_label_seconds.csv")

        # read the expanded train labels back in
        labels = pd.read_csv("train_label_seconds.csv")

    # split the data and write it to csv files
    train_validation_split(data, labels, less_data)

    # preprocess every data set, in this order
    for data_type in ('train_split', 'validation_split', 'test', 'online_test'):
        preprocess_data_to_csv(data_type, metrics, less_data, remove_time)
    

In [None]:
# preprocess the data using the minute averages (second argument: less_data=True)
# NOTE(review): the third argument is remove_time=True, so the Time column is
# removed, although this comment used to say "keep the timestamps" - confirm
# which of the two is intended. 'BRAmplitude' is listed twice in the metrics.
preprocess(['BRAmplitude', 'HRV', 'AuxADC1', 'AuxADC2', 'AuxADC3', 'BRAmplitude', 'BRNoise', 'BRConfidence', 'ECGAmplitude', 'ECGNoise', 'HRConfidence', 'ROGState', 'ROGTime', 'VerticalMin', 'VerticalPeak', 'LateralMin', 'LateralPeak', 'SagittalMin', 'SagittalPeak'], True, True)

# The cells underneath are not needed, the preprocessing has already been completed

In [None]:
# '''
# Putting it all together in this cell
# '''

# # filename prefix (for the different datasets)
# # choose from : train, test, online_test
# prefix = 'train'

# # the filename of the completely preprocessed data
# result_filename = 'cleaned_'+ prefix + '_feat.csv'

# # the train data and labels
# feat = pd.read_csv(prefix + "_feat.csv")
# label = pd.read_csv(prefix + "_label.csv")

# # clean the data (except for Time)
# cleaned_feat = clean_data(feat, ['BRAmplitude', 'HRV', 'AuxADC1', 'AuxADC2', 'AuxADC3'])

# # create the data per minute csv file
# seconds_to_minutes(cleaned_feat, prefix + "_feat_average.csv")

# # get the feat per minute
# feat_average = pd.read_csv(prefix + "_feat_average.csv")

# # remove Time from the data, since we don't want to train on that
# cleaned_feat_average = clean_data(feat_average, ['Time'])

# # write the data to a csv
# cleaned_feat_average.to_csv(path_or_buf=result_filename, index=False)

In [None]:
# '''
# This cell is not needed, but you can add the labels to the data with this cell

# NOTE: Only do this when Time is still in the data
# '''

# # merge the data and labels on the Time metric
# merged = feat_average.merge(label, on='Time')

# # optional: remove Time from the data
# # merged = clean_data(merged, ['Time'])

# # write to a csv
# merged.to_csv('labeled_'+result_filename, index=False)

# # train data and labels combined
# labeled = pd.read_csv('labeled_'+result_filename)
