# Data understanding and preprocessing

In [None]:
'''
Import cell :D
''' 

import pandas as pd
from collections import Counter 
from collections import defaultdict
import random
import math
import matplotlib.pyplot as plt
import numpy as np
import os

# test your pandas with this
# pd.test()

In [None]:
'''
Get the train and test data from the csv files
'''

# train set: the features first, then the labels that belong to them
train_feat, train_label = map(pd.read_csv, ("train_feat.csv", "train_label.csv"))

# test set: same order, features first and labels second
test_feat, test_label = map(pd.read_csv, ("test_feat.csv", "test_label.csv"))

In [None]:
'''
Test cell for pandas understanding (1/3)
''' 

# show only the first three rows of the train data
train_feat.head(3)

In [None]:
'''
Test cell for pandas understanding (2/3)
''' 

# show the HR column of the train data
train_feat["HR"]

In [None]:
'''
Test cell for pandas understanding (3/3)
''' 

# show the column names of the train data as a plain list
train_feat.columns.tolist()

In [None]:
'''
Remove unmeasured metrics and the metrics that are given by the user

@param  pandas DataFrame    The data
@param  array               The list of metrics that need to be removed
@return pandas DataFrame    The cleaned data
'''
def clean_data(d, metrics=None):
    """Drop constant columns and explicitly listed columns from a copy of the data.

    A column that holds exactly one distinct value is treated as unmeasured
    and is removed together with the columns named in ``metrics``.

    @param  d        pandas DataFrame  The data (never modified, a copy is made)
    @param  metrics  iterable of str   Extra column names to remove; the list
                                       passed in by the caller is not modified
    @return pandas DataFrame           The cleaned copy. If a requested column
                                       does not exist, a message is printed and
                                       the complete, unmodified copy is returned.
    """
    # copy before the try block so the fallback below always has a frame to return
    data = d.copy()

    # work on a private list: neither a shared default nor the caller's list
    # may accumulate column names from one call to the next
    to_drop = list(metrics) if metrics is not None else []

    # a column with a single distinct value was never really measured;
    # count on the frame that was passed in, not on module-level data
    to_drop.extend(metric for metric in data if len(set(data[metric])) == 1)

    try:
        # dict.fromkeys removes duplicates while keeping a stable order
        return data.drop(columns=list(dict.fromkeys(to_drop)))
    except KeyError:
        # only a missing column name is expected here; anything else should surface
        print("The used list (", to_drop, ") contains column names that don't exist in the data.")

        # return the (copied) original data
        return data

In [None]:
'''
Test cell
'''

# remove the listed metrics (and any single-valued column) from the train data, then display the result
train_cleaned = clean_data(train_feat, ['BRAmplitude', 'HRV', 'AuxADC1', 'AuxADC2', 'AuxADC3'])
train_cleaned

In [None]:
'''
Create a csv with the averages of the data (per minute)

@param  pandas DataFrame    The data (!including the Time column!)
@param  string              The name of the target file
'''
def seconds_to_minutes(data, filename):
    """Write the per-minute averages of per-second data to a csv file.

    Rows are grouped by position in blocks of 60 (the last block may be
    shorter). Every column is averaged per block, except Time, which takes
    the timestamp of the first row of the block as a whole number.
    An existing file with the same name is overwritten.

    @param  data      pandas DataFrame  The data (!including the Time column!)
    @param  filename  string            The name of the target file
    """
    # number the rows per block: rows 0-59 -> 0, rows 60-119 -> 1, ...
    minute = np.arange(len(data.index)) // 60

    # average all columns per block in one pass
    averages = data.groupby(minute).mean()

    # use the timestamp of the first second of every block; converting the
    # number itself works for integer and float timestamps alike, unlike
    # cutting the last two characters off its string form
    averages["Time"] = data["Time"].to_numpy()[::60].astype("int64")

    # pandas writes the header and the rows, and closes the file even on errors
    averages.to_csv(filename, index=False)

In [None]:
'''
Test cell
'''

# train set: write the per-minute averages to csv, then load them back
seconds_to_minutes(train_feat, "train_feat_average.csv")
train_feat_average = pd.read_csv("train_feat_average.csv")

# test set: write the per-minute averages to csv, then load them back
seconds_to_minutes(test_feat, "test_feat_average.csv")
test_feat_average = pd.read_csv("test_feat_average.csv")

In [None]:
'''
Get the number of unique values of the metrics/columns

@param  pandas DataFrame    The data
@return int dict            The number of unique values per metric/column
'''
def unique_vals(data):
    """Count the distinct values of every metric/column.

    @param  data  pandas DataFrame  The data
    @return dict                    Column name -> number of distinct values
                                    in that column of ``data``
    """
    # a set drops duplicates, so its length is the number of distinct values;
    # the values are read from the frame that was passed in
    return {metric: len(set(data[metric])) for metric in data}

In [None]:
'''
Test cell
'''

# print the number of unique values per column of the train set
print(unique_vals(train_feat))

In [None]:
'''
Get the occurrences of every value per metric

@param  pandas DataFrame    The data
@return Counter dict        The occurrences of every value per metric
'''
def occurences(data):
    """Tally how often every value shows up in every metric/column.

    @param  data  pandas DataFrame  The data
    @return defaultdict of Counter  Column name -> Counter of the values seen
                                    in that column
    """
    # one Counter per column, created on first use
    tally = defaultdict(Counter)

    # walk over every (column, value) pair of the data
    pairs = ((name, entry) for name in data for entry in data[name])
    for name, entry in pairs:
        tally[name][entry] += 1

    return tally

In [None]:
'''
Test cell
'''
# count how often every value occurs, per column of the train set
o = occurences(train_feat)

# print the value counts of the HR column
print(o['HR'])

In [None]:
'''
Get the indices of the labels that match the activity

@param  pandas DataFrame    The labels
@param  string              The activity
@return list                The indices of the rows that match the activity
'''
def get_indices(labels, activity):
    """Return the index values of the label rows whose Label equals the activity.

    @param  labels    pandas DataFrame  The labels (needs a 'Label' column)
    @param  activity  string            The activity
    @return list                        The index values of the matching rows
    """
    # True for every row that carries the requested activity
    matches = labels['Label'] == activity

    # keep the matching rows and hand back their index as a plain list
    return labels[matches].index.tolist()

In [None]:
'''
Get the data of a specific activity
@param  pandas DataFrame    The data
@param  pandas DataFrame    The labels
@param  string              The activity
@return pandas DataFrame    The data that match the activity
'''
def data_of_activity(data, labels, activity):
    """Select the rows of the data whose label equals the activity.

    Rows of ``data`` and ``labels`` are matched by position: the n-th label
    belongs to the n-th data row, whatever index either frame carries.

    @param  data      pandas DataFrame  The data
    @param  labels    pandas DataFrame  The labels (needs a 'Label' column)
    @param  activity  string            The activity
    @return pandas DataFrame            The data rows that match the activity
    """
    # boolean array, one entry per label row, in row order
    matches = (labels['Label'] == activity).to_numpy()

    # iloc selects by position, so turn the mask into row positions instead
    # of using the index labels of the label frame
    return data.iloc[np.flatnonzero(matches)]

In [None]:
'''
Test cell
'''

# display the rows of the per-minute train data whose label is "towlift"
data_of_activity(train_feat_average, train_label, "towlift")

In [None]:
'''
Putting it all together in this cell
'''

# filename prefix (for the different datasets)
# choose from : train, test, online_test
prefix = 'train'

# where the completely preprocessed data ends up
result_filename = 'cleaned_{}_feat.csv'.format(prefix)

# load the features and the labels that belong to the prefix
feat = pd.read_csv('{}_feat.csv'.format(prefix))
label = pd.read_csv('{}_label.csv'.format(prefix))

# remove the listed metrics and the single-valued columns (Time is not in the list)
cleaned_feat = clean_data(
    feat,
    ['BRAmplitude', 'HRV', 'AuxADC1', 'AuxADC2', 'AuxADC3'],
)

# average per minute through a csv file, then load the averages back
seconds_to_minutes(cleaned_feat, '{}_feat_average.csv'.format(prefix))
feat_average = pd.read_csv('{}_feat_average.csv'.format(prefix))

# now also remove Time from the averaged data
cleaned_feat_average = clean_data(feat_average, ['Time'])

# write the data to a csv
cleaned_feat_average.to_csv(result_filename, index=False)

In [None]:
'''
This cell is not needed, but you can add the labels to the data with this cell

NOTE: Only do this when Time is still in the data
'''

# label = label.dropna(axis=1)

# join every averaged row with the label row that has the same Time
merged = pd.merge(feat_average, label, on='Time')

# optional: remove Time from the data
# merged = clean_data(merged, ['Time'])

# store the labeled data as csv
merged.to_csv('labeled_{}'.format(result_filename), index=False)

# and load it straight back: train data and labels combined
labeled = pd.read_csv('labeled_{}'.format(result_filename))


# ! IGNORE EVERYTHING BELOW FOR NOW !

In [None]:
# split the data columns into discrete and numeric metrics, given a threshold
def discrete_numeric_split(data, thres=10):
    """Sort the columns of the data into discrete and numeric metrics.

    A column counts as numeric when unique_vals reports more than ``thres``
    unique values for it; every other column counts as discrete.

    @param  data   pandas DataFrame  The data
    @param  thres  int               The threshold on the number of unique values
    @return (dict, dict)             The discrete columns and the numeric
                                     columns, both keyed by column name
    """
    # number of unique values per column
    distinct = unique_vals(data)

    discrete, numeric = {}, {}
    for name in data:
        # pick the bucket this column belongs in, then store the column there
        bucket = numeric if distinct.get(name) > thres else discrete
        bucket[name] = data[name]

    return discrete, numeric

# split the train columns, using the default threshold of 10 unique values
discrete, numeric = discrete_numeric_split(train_feat)

In [None]:
# print the number of discrete metrics, followed by the metrics themselves
print(len(discrete),discrete)

In [None]:
# print the number of numeric metrics, followed by the metrics themselves
print(len(numeric),numeric)

In [None]:
# get the ratio of the current activity
def ratio(labels, activity):
    """Return the fraction of the labels that equal the activity.

    @param  labels    sequence  The labels
    @param  activity  string    The activity
    @return number              The matching count divided by the total
                                count, or 0 when there are no labels
    """
    size = len(labels)

    # nothing to divide by: no labels means no ratio
    if not size:
        return 0

    # count the labels that are equal to the activity
    hits = sum(1 for entry in labels if entry == activity)

    return hits / size


# calculate the partial entropy
def entropy_sub(p):
    """Return p * log2(p), with the value for p == 0 defined as 0.

    @param  p  number  The probability
    @return number     p times the base-2 logarithm of p
    """
    # the logarithm is not defined at 0, so that case is answered directly
    return p * math.log(p, 2) if p != 0 else 0


# calculate the entropy
def entropy(labels):
    """Return the Shannon entropy (in bits) of a collection of labels.

    The entropy is -sum(p * log2(p)) over the distinct labels, where p is
    the fraction of the collection that carries that label.

    @param  labels  sequence  The labels
    @return number            The entropy; 0 for an empty collection and for
                              a collection with a single distinct label
    """
    total = len(labels)

    # no labels, no uncertainty (and nothing to divide by)
    if not total:
        return 0

    # one pass over the labels gives the count of every distinct label
    counts = Counter(labels)

    # every distinct label contributes -p*log2(p); a count is never 0 here,
    # so the logarithm is always defined
    return -sum((count / total) * math.log2(count / total) for count in counts.values())


# calculate the entropy after a split 
def split_entropy(labels, indices, N):
    """Return the weighted entropy of the labels after a split.

    Every group of indices is one side of the split. The entropy of the
    labels in a group is weighted by the share of the N labels that ended
    up in that group, and the weighted entropies are added up.

    @param  labels   sequence          The labels, addressable by index
    @param  indices  list of lists     The indices of every group of the split
    @param  N        int               The total number of labels
    @return number                     The weighted entropy; 0 when N is 0 or
                                       when there are no groups
    """
    # without labels there is nothing to weigh (and nothing to divide by)
    if not N:
        return 0

    # initial entropy
    e = 0

    for sub_indices in indices:

        # get the labels of this group, skipping indices beyond the label count
        sub_labels = [labels[index] for index in sub_indices if index < N]

        # entropy of this group only, weighted by the size of this group
        e += (len(sub_labels) / N) * entropy(sub_labels)

    return e


# calculate the information gain of the split
def information_gain(label_data, indices):
    """Return the entropy of the labels minus the entropy after the split.

    @param  label_data  pandas DataFrame  The labels (needs a Label column)
    @param  indices     list of lists     The indices of every group of the split
    @return number                        The information gain of the split
    """
    labels = label_data.Label

    # entropy of the labels before anything is split
    before = entropy(labels)

    # entropy that is left once the labels are split into the groups
    after = split_entropy(labels, indices, len(labels))

    return before - after

In [None]:
# tests for me to check if the code worked
# e: the entropy of all train labels
e = entropy(train_label.Label)
# ig: the information gain of a split that holds a single, hand-picked group of row indices
ig = information_gain(train_label, [[11, 22, 33, 44, 55, 66, 77, 88, 99, 111, 122, 133, 144, 155, 166, 177, 188, 199]])

print(e,ig)

#  {'Label': Counter({  'lift':         51,
#                       'lying':        22,
#                       'sitting':      58,
#                       'snowboarding': 51,
#                       'standing':     88    }),
#
#                       'total':        270

In [None]:
class Data(object):
    """Holder for the discrete and the numeric metrics of a data set.

    Both collections are addressed by metric name first and by index second,
    e.g. ``discrete[metric][index]``.
    """

    def __init__(self, discrete, numeric):
        """Store both collections and remember how many metrics each one has.

        @param  discrete  mapping  The discrete metrics, keyed by metric name
        @param  numeric   mapping  The numeric metrics, keyed by metric name
        """
        self.discrete = discrete
        self.numeric = numeric
        # the sizes are taken once, when the object is created
        self.sd = len(discrete)
        self.sn = len(numeric)

    def get_discrete(self, metric, index):
        """Return the discrete value of the metric at the index, or -1 if the lookup fails."""
        try:
            return self.discrete[metric][index]
        except (LookupError, TypeError):
            # unknown metric, index out of range, or an unusable index
            return -1

    def get_numeric(self, metric, index):
        """Return the numeric value of the metric at the index, or -1 if the lookup fails."""
        try:
            return self.numeric[metric][index]
        except (LookupError, TypeError):
            # unknown metric, index out of range, or an unusable index
            return -1

    def size_discrete(self):
        """Return the number of discrete metrics counted at construction."""
        return self.sd

    def size_numeric(self):
        """Return the number of numeric metrics counted at construction."""
        return self.sn