# Data understanding and preprocessing

In [None]:
'''
Import cell :D
''' 

import pandas as pd
from collections import Counter 
from collections import defaultdict
import random
import math
import matplotlib.pyplot as plt
import numpy as np
import os

# test your pandas with this
# pd.test()

In [None]:
'''
Load the features and the labels of the train and test set from their csv files
'''

# train set: the features first, then the labels
train_feat, train_label = pd.read_csv("train_feat.csv"), pd.read_csv("train_label.csv")

# test set: the features first, then the labels
test_feat, test_label = pd.read_csv("test_feat.csv"), pd.read_csv("test_label.csv")

In [None]:
'''
Pandas exploration (1/3): look at the top of the train features
'''

# show the first 3 rows
train_feat.head(3)

In [None]:
'''
Pandas exploration (2/3): select a single column
'''

# show the HR column
train_feat["HR"]

In [None]:
'''
Pandas exploration (3/3): list the available columns
'''

# show the column names as a plain list
list(train_feat.columns)

In [None]:
def clean_data(d, metrics = None):
    '''
    Remove unmeasured metrics and the metrics that are given by the user

    A metric counts as unmeasured when its column holds exactly one distinct value.
    Neither the given data nor the given list of metrics is modified.

    @param  pandas DataFrame    The data
    @param  array               The list of metrics that need to be removed (optional)
    @return pandas DataFrame    The cleaned data; an unchanged copy of the data when
                                the list contains a name that is not a column
    '''

    # create a hard copy of the data
    data = d.copy()

    # work on a private list, so the caller's list (and the default) are never grown
    requested = [] if metrics is None else list(metrics)

    # a metric that is not a column can't be removed: report it and give the copy back
    if any(metric not in data.columns for metric in requested):
        print("The used list (",requested,") contains column names that don't exist in the data.")
        return data

    # columns with a single distinct value were never really measured
    # (computed on the given data itself, not on some global dataset)
    unmeasured = [metric for metric in data if len(set(data[metric])) == 1]

    # remove duplicates (keeping the order), then drop those metrics from the data
    to_remove = list(dict.fromkeys(requested + unmeasured))
    return data.drop(columns=to_remove)

In [None]:
'''
Test cell
'''

# clean the train data: drop the listed metrics plus every metric clean_data
# itself decides to remove, then show the result as the cell output
train_cleaned = clean_data(train_feat, ['BRAmplitude', 'HRV', 'AuxADC1', 'AuxADC2', 'AuxADC3'])
train_cleaned

In [None]:
def seconds_to_minutes(data, filename, rows_per_minute = 60):
    '''
    Create a csv with the averages of the data (per minute)

    The rows are averaged in consecutive windows of rows_per_minute rows; the last
    window may be shorter. The Time value of every written row is the Time of the
    first row of its window, written as an integer.

    @param  pandas DataFrame    The data (!including the Time column!)
    @param  string              The name of the target file (overwritten if it exists)
    @param  int                 The number of rows that make up one minute (default 60)
    '''

    # a window needs at least one row
    if rows_per_minute < 1:
        raise ValueError("rows_per_minute must be at least 1")

    # mode 'w' truncates an existing file, so we always start with a clean file;
    # the with-block closes the file, even when writing a row fails
    with open(filename, 'w') as f:

        # the first line holds the column names
        f.write(",".join(list(data)) + "\n")

        # loop over the start row of every minute
        for start in range(0, len(data.index), rows_per_minute):

            # get the minute of data out of the dataset
            part = data.iloc[start:start + rows_per_minute]

            # calculate the means of the columns (as strings so pandas doesn't read everything as float64)
            mean = part.mean().astype(str)

            # use the timestamp of the first row of the window, converted to an integer;
            # read it from the column itself so an integer Time is not cut short
            mean["Time"] = str(int(data["Time"].iloc[start]))

            # write the line of values
            f.write(",".join(str(value) for value in mean) + "\n")

In [None]:
'''
Test cell: build the per minute csv files and load them again
'''

# write the per minute averages of the train data, then of the test data
seconds_to_minutes(train_feat, "train_feat_average.csv")
seconds_to_minutes(test_feat, "test_feat_average.csv")

# read both per minute files back in (train first, then test)
train_feat_average, test_feat_average = pd.read_csv("train_feat_average.csv"), pd.read_csv("test_feat_average.csv")

In [None]:
def unique_vals(data):
    '''
    Get the number of unique values of the metrics/columns

    @param  pandas DataFrame    The data
    @return dict                The number of unique values per metric/column
    '''

    # count on the data that was passed in (not on a global dataset);
    # the set datastructure removes the duplicates of a column
    return {metric: len(set(data[metric])) for metric in data}

In [None]:
'''
Test cell
'''

# print the number of unique values per metric for the train set
print(unique_vals(train_feat))

In [None]:
def occurences(data):
    '''
    Count how often every value occurs, per metric

    @param  pandas DataFrame    The data
    @return defaultdict         A Counter per metric: value -> number of occurences
    '''

    # one Counter per metric, created on first use
    counts = defaultdict(Counter)

    for metric in data:
        column = data[metric]

        # a metric without values gets no entry at all
        if len(column) > 0:

            # let the Counter tally all values of the column at once
            counts[metric].update(column)

    # return :D
    return counts

In [None]:
'''
Test cell
'''
# count the occurences of every value in the train set
o = occurences(train_feat)

# print the counted values of the HR metric
print(o['HR'])

In [None]:
def get_indices(labels, activity):
    '''
    Get the indices of the labels that match the activity

    @param  pandas DataFrame    The labels (needs a 'Label' column)
    @param  string              The activity
    @return list                The index values of the rows that match the activity
    '''

    # mark the rows whose label equals the activity
    matches = labels['Label'] == activity

    # keep the index values of the marked rows
    matching_index = labels.index[matches]

    # hand them back as a plain python list
    return matching_index.tolist()

In [None]:
def data_of_activity(data, labels, activity):
    '''
    Get the data of a specific activity

    The index values of the matching labels are used as row positions in the data.
    NOTE(review): this presumably expects labels with a default 0..n-1 index
    that lines up row by row with the data - confirm against the callers.

    @param  pandas DataFrame    The data
    @param  pandas DataFrame    The labels
    @param  string              The activity
    @return pandas DataFrame    The data that match the activity
    '''

    # look up the matching label indices and select those rows by position
    return data.iloc[get_indices(labels, activity)]

In [None]:
'''
Test cell
'''

# show the per minute train data of the rows that are labeled "towlift"
data_of_activity(train_feat_average, train_label, "towlift")

In [None]:
def simple_split(data, labels, validation_minutes = 60, rows_per_minute = 60):
    '''
    Use the beginning of the train set as the validation set (first 60 minutes by default)

    The labels are cut after validation_minutes rows, the data after
    validation_minutes * rows_per_minute rows (3600 with the defaults).

    @param  pandas DataFrame    The train data
    @param  pandas DataFrame    The train labels
    @param  int                 The number of minutes that go to the validation set (default 60)
    @param  int                 The number of data rows per minute (default 60)
    @return pandas DataFrame    The split train data
    @return pandas DataFrame    The split validation data
    @return pandas DataFrame    The split train labels
    @return pandas DataFrame    The split validation labels
    '''

    # a negative size would silently slice from the end of the data
    if validation_minutes < 0 or rows_per_minute < 0:
        raise ValueError("validation_minutes and rows_per_minute can't be negative")

    # the row where the validation part ends, for the data and for the labels
    data_cut = validation_minutes * rows_per_minute
    label_cut = validation_minutes

    # the validation part is everything before the cut, the train part everything after it
    validation_feat = data.iloc[:data_cut]
    train_feat = data.iloc[data_cut:]
    validation_label = labels.iloc[:label_cut]
    train_label = labels.iloc[label_cut:]

    # return the split data and labels
    return train_feat, validation_feat, train_label, validation_label

In [None]:
def train_validation_split(data, labels, include_all = False):
    '''
    Split the train data into a train and a validation part and write the parts to csv files

    @param  pandas DataFrame    The train data
    @param  pandas DataFrame    The train labels
    @param  boolean             Include all labels in the validation set (not implemented)
    '''

    # a validation set that holds all the activities is not supported
    if include_all:
        print("Not implemented yet.")
        return

    # split off the beginning of the data as the validation set
    feat_train, feat_validation, label_train, label_validation = simple_split(data, labels)

    # every part and the csv file it is written to
    targets = (
        ('train_split_feat.csv', feat_train),
        ('validation_split_feat.csv', feat_validation),
        ('train_split_label.csv', label_train),
        ('validation_split_label.csv', label_validation),
    )

    # write the parts to their files, without the index column
    for target, part in targets:
        part.to_csv(path_or_buf=target, index=False)

In [None]:
def preprocess_data_to_csv(data_type, metrics, remove_time):
    '''
    Preprocess the data of one data type and write the result to a csv file

    Reads <data_type>_feat.csv, writes <data_type>_feat_average.csv as an
    intermediate file and preprocessed_<data_type>_feat.csv as the result.

    @param  string    The data type (train, validation, test or online_test)
    @param  array     The metrics to remove
    @param  boolean   Remove the Time column from the result or not
    '''

    # the files this step writes: the final result and the per minute averages
    target = 'preprocessed_'+ data_type + '_feat.csv'
    average_file = data_type + "_feat_average.csv"

    # load the raw features
    raw = pd.read_csv(data_type + "_feat.csv")

    # clean them (Time stays in) and write the per minute averages
    seconds_to_minutes(clean_data(raw, metrics), average_file)

    # load the per minute averages again
    per_minute = pd.read_csv(average_file)

    # drop Time when we don't want to train on it
    if remove_time:
        per_minute = clean_data(per_minute, ['Time'])

    # write the result, without the index column
    per_minute.to_csv(path_or_buf=target, index=False)

In [None]:
def preprocess(metrics, remove_time = True):
    '''
    Preprocess all the data and write it to csv files for the ml models

    @param  array      The metrics to remove
    @param  boolean    Remove the Time column or not
    '''

    # load the train features and the train labels
    features = pd.read_csv("train_feat.csv")
    targets = pd.read_csv("train_label.csv")

    # split them and write the parts to csv files
    train_validation_split(features, targets)

    # preprocess the train part first, then the validation part
    for split_name in ('train_split', 'validation_split'):
        preprocess_data_to_csv(split_name, metrics, remove_time)

In [None]:
# run the whole preprocessing; the Time column is kept in the result
preprocess(
    ['BRAmplitude', 'HRV', 'AuxADC1', 'AuxADC2', 'AuxADC3'],
    remove_time=False,
)

# The cells underneath are not needed, the preprocessing has already been completed

In [None]:
# '''
# Putting it all together in this cell
# '''

# # filename prefix (for the different datasets)
# # choose from : train, test, online_test
# prefix = 'train'

# # the filename of the completely preprocessed data
# result_filename = 'cleaned_'+ prefix + '_feat.csv'

# # the train data and labels
# feat = pd.read_csv(prefix + "_feat.csv")
# label = pd.read_csv(prefix + "_label.csv")

# # clean the data (except for Time)
# cleaned_feat = clean_data(feat, ['BRAmplitude', 'HRV', 'AuxADC1', 'AuxADC2', 'AuxADC3'])

# # create the data per minute csv file
# seconds_to_minutes(cleaned_feat, prefix + "_feat_average.csv")

# # get the feat per minute
# feat_average = pd.read_csv(prefix + "_feat_average.csv")

# # remove Time from the data, since we don't want to train on that
# cleaned_feat_average = clean_data(feat_average, ['Time'])

# # write the data to a csv
# cleaned_feat_average.to_csv(path_or_buf=result_filename, index=False)

In [None]:
# '''
# This cell is not needed, but you can add the labels to the data with this cell

# NOTE: Only do this when Time is still in the data
# '''

# # merge the data and labels on the Time metric
# merged = feat_average.merge(label, on='Time')

# # optional: remove Time from the data
# # merged = clean_data(merged, ['Time'])

# # write to a csv
# merged.to_csv('labeled_'+result_filename, index=False)

# # train data and labels combined
# labeled = pd.read_csv('labeled_'+result_filename)
