# Data understanding and preprocessing

In [25]:
'''
Import block :D
''' 

import pandas as pd
from collections import Counter 
from collections import defaultdict
import random
import math
import matplotlib.pyplot as plt
import numpy as np
import os

# test your pandas with this
# pd.test()

In [26]:
'''
Load the train and test sets (features and labels) from their csv files
'''

# the train set: features first, then the matching labels
train_feat, train_label = pd.read_csv("train_feat.csv"), pd.read_csv("train_label.csv")

# the test set: features first, then the matching labels
test_feat, test_label = pd.read_csv("test_feat.csv"), pd.read_csv("test_label.csv")




# ! Not needed, kept it as a comment to be sure

# label = label.dropna(axis=1)
# merged = feat.merge(label, on='Time')
# merged.to_csv("train.csv", index=False)

# train data and labels combined
# train = pd.read_csv("train.csv")

# !

In [27]:
'''
Sanity checks to get familiar with pandas
'''

# the first 3 rows of the train features
print(train_feat.head(3))

# the HR column
print(train_feat['HR'])

# the column names as a list
print(train_feat.columns.tolist())

            Time   HR    BR  SkinTemp  Posture  Activity  PeakAccel  \
0  1358759280432  101  18.6   -3276.8       12      0.58       0.95   
1  1358759281432  101  17.0   -3276.8       15      0.46       1.02   
2  1358759282432  101  17.0   -3276.8       10      0.19       0.42   

   BRAmplitude  BRNoise  BRConfidence   ...     ROGTime  VerticalMin  \
0          222    65535           255   ...          61        -1.20   
1          221    65535           255   ...          62        -1.25   
2          253    65535           255   ...          63        -1.18   

   VerticalPeak  LateralMin  LateralPeak  SagittalMin  SagittalPeak  AuxADC1  \
0         -0.51       -0.15         0.72        -0.53          0.31      427   
1         -0.39       -0.24         0.77        -0.71         -0.10      430   
2         -0.83       -0.18         0.24        -0.31         -0.01      431   

   AuxADC2  AuxADC3  
0      441      515  
1      444      516  
2      444      517  

[3 rows x 26 col

In [28]:
def clean_data(d, metrics=None):
    '''
    Remove unmeasured metrics and the metrics that are given by the user

    A metric counts as unmeasured when its column holds one single distinct
    value (a column that is NaN everywhere counts as well). Neither the
    passed data nor the passed list is modified.

    @param  pandas DataFrame    The data
    @param  array               The list of metrics that need to be removed
                                (optional, nothing extra is removed when omitted)
    @return pandas DataFrame    The cleaned data; when a given metric is not a
                                column of the data a message is printed and an
                                unchanged copy of the data is returned
    '''
    # create a hard copy of the data
    data = d.copy()

    # copy the given metrics, so the list of the caller is never changed
    # (a shared default list would keep growing between calls)
    requested = list(metrics) if metrics is not None else []

    # a given metric that is not a column can't be removed
    if any(metric not in data.columns for metric in requested):

        # print a custom error message
        print("The used list (", requested, ") contains column names that don't exist in the data.")

        # return the (copied) original data
        return data

    # the columns with one distinct value were never really measured
    constant = [metric for metric in data if data[metric].nunique(dropna=False) == 1]

    # remove duplicates (keeping the order)
    # then remove those metrics from the data
    return data.drop(columns=list(dict.fromkeys(requested + constant)))

In [45]:
'''
Test block
'''

# clean the train data: remove the listed metrics on top of the unmeasured ones
train_cleaned = clean_data(
    train_feat,
    metrics=['BRAmplitude', 'HRV', 'AuxADC1', 'AuxADC2', 'AuxADC3'],
)

# show the cleaned data (displayed as the last expression of the cell)
train_cleaned

Unnamed: 0,Time,HR,BR,Posture,Activity,PeakAccel,ECGAmplitude,ECGNoise,HRConfidence,ROGState,ROGTime,VerticalMin,VerticalPeak,LateralMin,LateralPeak,SagittalMin,SagittalPeak
0,1358759280432,101,18.6,12,0.58,0.95,0.00524,0.00030,100,1,61,-1.20,-0.51,-0.15,0.72,-0.53,0.31
1,1358759281432,101,17.0,15,0.46,1.02,0.00510,0.00030,100,1,62,-1.25,-0.39,-0.24,0.77,-0.71,-0.10
2,1358759282432,101,17.0,10,0.19,0.42,0.00510,0.00030,100,1,63,-1.18,-0.83,-0.18,0.24,-0.31,-0.01
3,1358759283432,102,15.6,3,0.20,0.40,0.00492,0.00032,100,1,64,-1.15,-0.95,-0.21,0.06,-0.21,0.14
4,1358759284432,103,15.6,-4,0.23,0.34,0.00492,0.00040,100,1,65,-1.03,-0.95,-0.16,0.01,0.10,0.32
5,1358759285432,104,14.6,-1,0.12,0.19,0.00496,0.00058,100,1,66,-1.12,-0.90,-0.16,0.09,-0.14,0.22
6,1358759286432,105,14.6,1,0.09,0.22,0.00496,0.00082,100,1,67,-1.12,-0.89,-0.10,0.09,-0.25,0.13
7,1358759287432,106,13.9,-4,0.18,0.37,0.00472,0.00076,100,1,68,-1.10,-0.85,-0.10,0.08,-0.12,0.40
8,1358759288432,107,13.9,-8,0.27,0.80,0.00472,0.00250,91,1,69,-1.39,-0.61,-0.37,0.31,-0.26,0.91
9,1358759289432,106,13.3,-10,0.11,0.34,0.00450,0.00266,81,1,70,-1.31,-0.81,-0.03,0.22,-0.16,0.32


In [33]:
def seconds_to_minutes(data, filename, seconds=60):
    '''
    Create a csv with the averages of the data (per minute)

    The first line of the file holds the column names. Every next line holds
    the column means of one block of rows; the Time value of such a line is
    the timestamp of the first row of the block, written as an integer.
    An existing file with the same name is overwritten.

    @param  pandas DataFrame    The data (needs a numeric Time column)
    @param  string              The name of the target file
    @param  int                 The number of rows per block (optional, 60 rows
                                of one second make up one minute)
    '''
    # 'w' empties an existing file, the with block closes the file even when a write fails
    with open(filename, 'w') as f:

        # write the line of column names
        f.write(",".join(str(column) for column in data) + "\n")

        # loop over the blocks of rows
        for start in range(0, len(data.index), seconds):

            # calculate the means of the columns (as strings so pandas doesn't read everything as float64)
            mean = data.iloc[start:start + seconds].mean().astype(str)

            # use the timestamp of the first row; int() drops the decimal for both
            # integer and float timestamps (slicing the string cuts real digits of an integer)
            mean["Time"] = str(int(data["Time"].iloc[start]))

            # write the line of values
            f.write(",".join(mean) + "\n")

In [34]:
'''
Test block
'''

# train data: create the per minute csv file, then read it back
seconds_to_minutes(train_feat, "train_feat_average.csv")
train_feat_average = pd.read_csv("train_feat_average.csv")

# test data: create the per minute csv file, then read it back
seconds_to_minutes(test_feat, "test_feat_average.csv")
test_feat_average = pd.read_csv("test_feat_average.csv")

In [35]:
def unique_vals(data):
    '''
    Get the number of unique values of the metrics/columns

    @param  pandas DataFrame    The data
    @return int dict            The number of unique values per metric/column
    '''
    # use the set datastructure to remove duplicates and get the length of the set;
    # the values are read from the passed data, not from a global data set
    return {metric: len(set(data[metric])) for metric in data}

In [36]:
'''
Test block
'''

# print the dictionary with the number of unique values per column of the train set
print(unique_vals(train_feat))

{'Time': 16020, 'HR': 142, 'BR': 331, 'SkinTemp': 1, 'Posture': 140, 'Activity': 96, 'PeakAccel': 174, 'BRAmplitude': 452, 'BRNoise': 1, 'BRConfidence': 1, 'ECGAmplitude': 155, 'ECGNoise': 166, 'HRConfidence': 95, 'HRV': 104, 'GSR': 1, 'ROGState': 4, 'ROGTime': 2039, 'VerticalMin': 234, 'VerticalPeak': 134, 'LateralMin': 134, 'LateralPeak': 109, 'SagittalMin': 216, 'SagittalPeak': 192, 'AuxADC1': 273, 'AuxADC2': 340, 'AuxADC3': 156}


In [37]:
def occurences(data):
    '''
    Count how often every value occurs, per metric

    @param  pandas DataFrame    The data
    @return Counter dict        The occurrences of every value per metric
                                (a metric without values gets no entry)
    '''
    tallies = defaultdict(Counter)

    for column in data:

        # count all values of the column in one go
        seen = Counter(iter(data[column]))

        # only store columns that hold at least one value
        if seen:
            tallies[column] = seen

    return tallies

In [38]:
'''
Test block
'''
# count how often every value occurs, per column of the train features
o = occurences(train_feat)

# print the counted values of the HR column
print(o['HR'])

Counter({97: 610, 101: 576, 99: 570, 95: 569, 100: 547, 98: 544, 94: 542, 102: 514, 96: 514, 93: 502, 103: 490, 92: 448, 104: 418, 105: 376, 106: 340, 91: 331, 107: 316, 108: 280, 109: 254, 90: 236, 110: 234, 89: 211, 111: 202, 112: 201, 138: 185, 127: 171, 137: 165, 113: 162, 131: 161, 132: 158, 133: 152, 114: 151, 136: 146, 124: 145, 125: 144, 126: 143, 139: 143, 122: 142, 129: 142, 115: 139, 128: 139, 123: 138, 116: 136, 130: 131, 121: 130, 120: 128, 88: 127, 135: 127, 140: 123, 118: 114, 134: 112, 87: 110, 117: 105, 119: 103, 143: 95, 148: 95, 142: 94, 151: 94, 141: 86, 144: 82, 145: 77, 147: 76, 153: 74, 149: 73, 155: 73, 152: 69, 154: 69, 86: 64, 79: 63, 156: 62, 150: 61, 146: 60, 85: 52, 82: 51, 80: 48, 78: 45, 157: 43, 83: 42, 159: 36, 84: 34, 81: 33, 160: 32, 158: 29, 161: 23, 76: 20, 204: 9, 77: 8, 187: 8, 205: 7, 203: 7, 188: 7, 202: 6, 192: 6, 75: 5, 201: 5, 190: 5, 173: 5, 195: 4, 200: 4, 199: 4, 198: 4, 206: 4, 185: 4, 168: 4, 162: 3, 219: 3, 215: 3, 197: 3, 196: 3, 186: 

In [39]:
def get_indices(labels, activity):
    '''
    Get the indices of the labels that match the activity

    @param  pandas DataFrame    The labels (needs an 'activity' column)
    @param  string              The activity
    @return list                The index values of the rows that match the activity
    '''
    # mark the rows of which the activity matches
    matches = labels['activity'] == activity

    # keep those rows and hand back their index values as a plain list
    return labels.loc[matches].index.tolist()

In [40]:
def data_of_activity(data, labels, activity):
    '''
    Get the data of a specific activity

    Row i of the labels is taken to describe row i of the data, so the rows
    are matched by position and not by the index values of the labels.

    @param  pandas DataFrame    The data
    @param  pandas DataFrame    The labels (needs an 'activity' column)
    @param  string              The activity
    @return pandas DataFrame    The data that match the activity
    '''
    # mark the label rows that match the activity
    matches = (labels['activity'] == activity).values

    # iloc selects by position, so use the positions of the matching rows
    # (index values are only equal to positions for a default index)
    return data.iloc[np.flatnonzero(matches)]

In [41]:
'''
Test block
'''

# show the rows of the per minute train data that are labeled "towlift"
# (a bare expression, the notebook displays it as the result of the cell)
data_of_activity(train_feat_average, train_label, "towlift")

Unnamed: 0,Time,HR,BR,SkinTemp,Posture,Activity,PeakAccel,BRAmplitude,BRNoise,BRConfidence,...,ROGTime,VerticalMin,VerticalPeak,LateralMin,LateralPeak,SagittalMin,SagittalPeak,AuxADC1,AuxADC2,AuxADC3
238,1358776020432,127.25,27.755,-3276.8,-12.05,0.052667,0.115,164.2,65535.0,255.0,...,78.5,-1.051833,-0.933667,0.0235,0.1585,0.109333,0.250833,430.65,437.333333,512.283333
239,1358776140432,121.65,24.74,-3276.8,-12.25,0.062667,0.127833,166.05,65535.0,255.0,...,198.5,-1.0455,-0.925667,-0.050667,0.093833,0.122333,0.269667,431.7,439.316667,513.25
240,1358776200432,120.55,26.18,-3276.8,-12.4,0.0625,0.1335,124.883333,65535.0,255.0,...,96.0,-1.041333,-0.903167,-0.153667,-0.002167,0.095,0.256333,431.616667,440.2,513.883333
241,1358776260432,130.0,29.306667,-3276.8,-8.766667,0.180167,0.347,158.25,65535.0,255.0,...,17.85,-1.214333,-0.801167,-0.155667,0.2,-0.032,0.3575,436.4,445.1,516.633333
247,1358776800432,128.35,21.69,-3276.8,-11.983333,0.048667,0.1085,120.633333,65535.0,255.0,...,160.5,-1.051667,-0.923833,-0.100833,0.028833,0.137,0.268167,433.133333,440.483333,515.333333
248,1358776860432,127.983333,25.533333,-3276.8,-13.783333,0.055333,0.113,133.516667,65535.0,255.0,...,220.5,-1.043167,-0.911167,-0.128833,0.005167,0.141833,0.287667,432.0,439.583333,514.083333
249,1358776920432,124.783333,23.675,-3276.8,-10.883333,0.064667,0.147667,115.5,65535.0,255.0,...,135.833333,-1.06,-0.918833,-0.0655,0.105833,0.097333,0.255167,432.166667,439.75,513.8
250,1358776980432,142.133333,22.681667,-3276.8,-4.316667,0.21,0.412167,218.95,65535.0,255.0,...,15.216667,-1.289333,-0.755667,-0.170333,0.254833,-0.1515,0.271333,429.75,435.616667,512.55


In [None]:
# IGNORE EVERYTHING BELOW THIS CELL

In [None]:
def discrete_numeric_split(data, thres=10):
    '''
    Split the data columns into discrete and numeric metrics

    @param  pandas DataFrame    The data
    @param  int                 The threshold: a metric with more unique values
                                than this is numeric, otherwise it is discrete
    @return dict, dict          The discrete metrics and the numeric metrics
                                (metric name -> column)
    '''
    # the number of unique values per metric
    counts = unique_vals(data)

    discrete, numeric = {}, {}

    for name in data:

        # pick the group the metric belongs to, then store its column there
        group = numeric if counts.get(name) > thres else discrete
        group[name] = data[name]

    return discrete, numeric

# split the train features with the default threshold (thres=10)
discrete, numeric = discrete_numeric_split(train_feat)

In [None]:
# print the number of discrete metrics, followed by the metrics themselves
print(len(discrete),discrete)

In [None]:
# print the number of numeric metrics, followed by the metrics themselves
print(len(numeric),numeric)

In [None]:
def ratio(labels, activity):
    '''
    Get the ratio of the labels that are equal to the activity

    @param  list        The labels
    @param  string      The activity
    @return number      The fraction of labels equal to the activity,
                        0 when there are no labels
    '''
    size = len(labels)

    # without labels there is nothing to divide by
    if not size:
        return 0

    # count the labels that are equal to our activity
    hits = sum(1 for label in labels if label == activity)

    return hits/size


def entropy_sub(p):
    '''
    Calculate the partial entropy term p*log2(p)

    @param  number      The chance p
    @return number      p*log2(p), or 0 when p is 0
    '''
    # math.log is not defined for 0, so that case is answered directly
    return 0 if p == 0 else p*math.log(p,2)


def entropy(labels):
    '''
    Calculate the (Shannon) entropy of the labels: -sum(p*log2(p)),
    where p is the share of every distinct label

    @param  list        The labels
    @return number      The entropy in bits, 0 when there are no labels
    '''
    # count every distinct label in one pass
    counts = Counter(labels)
    total = sum(counts.values())

    # initial entropy
    e = 0

    # loop over the distinct labels
    for count in counts.values():

        # get the chance of the label (never 0, only counted labels are visited)
        p = count/total

        # every label contributes -p*log2(p); no (1-p) term belongs in this sum
        e -= p*math.log2(p)

    return e


def split_entropy(labels, indices, N):
    '''
    Calculate the entropy after a split: the entropy of every subset,
    weighted by the share of the labels that ended up in that subset

    @param  list            The labels
    @param  list of lists   The indices per subset of the split
    @param  int             The total number of labels; indices that are not
                            below this number are ignored
    @return number          The weighted entropy, 0 when N is 0 or there are
                            no subsets
    '''
    # nothing to weigh without labels
    if not N:
        return 0

    # initial entropy
    e = 0

    # get the entropy after the split
    for sub_indices in indices:

        # get the labels of the indices
        sub_labels = [labels[index] for index in sub_indices if index < N]

        # add the entropy of this subset (not of all labels),
        # normalised per subset with the size of the subset
        e += (len(sub_labels)/N)*entropy(sub_labels)

    return e


def information_gain(label_data, indices):
    '''
    Calculate the information gain of a split: the entropy of the labels
    minus the entropy after the split

    @param  pandas DataFrame    The label data (the labels are read from its Label attribute)
    @param  list of lists       The indices per subset of the split
    @return number              The information gain
    '''
    labels = label_data.Label

    # the entropy before and after the split
    before = entropy(labels)
    after = split_entropy(labels, indices, len(labels))

    return before - after

In [None]:
# tests for me to check if the code worked
# the entropy of all train labels
e = entropy(train_label.Label)
# the information gain of a split that consists of one hand-picked group of row indices
ig = information_gain(train_label, [[11, 22, 33, 44, 55, 66, 77, 88, 99, 111, 122, 133, 144, 155, 166, 177, 188, 199]])

# print the entropy and the information gain
print(e,ig)

#  {'Label': Counter({  'lift':         51,
#                       'lying':        22,
#                       'sitting':      58,
#                       'snowboarding': 51,
#                       'standing':     88    }),
#
#                       'total':        270

In [None]:
class Data(object):
    '''
    Holder of the discrete and the numeric metrics of a data set
    '''

    def __init__(self, discrete, numeric):
        '''
        @param  dict    The discrete metrics (metric name -> values)
        @param  dict    The numeric metrics (metric name -> values)
        '''
        self.discrete = discrete
        self.numeric = numeric

        # the sizes are stored once, at creation
        self.sd = len(discrete)
        self.sn = len(numeric)

    def get_discrete(self, metric, index):
        '''
        Get the discrete value of a metric at an index

        @return The value, or -1 when the metric or the index is not valid
        '''
        try:
            return self.discrete[metric][index]
        except (LookupError, TypeError):
            # unknown metric, index out of range or an unusable index
            return -1

    def get_numeric(self, metric, index):
        '''
        Get the numeric value of a metric at an index

        @return The value, or -1 when the metric or the index is not valid
        '''
        try:
            return self.numeric[metric][index]
        except (LookupError, TypeError):
            # unknown metric, index out of range or an unusable index
            return -1

    def size_discrete(self):
        '''
        @return int     The number of discrete variables
        '''
        return self.sd

    def size_numeric(self):
        '''
        @return int     The number of numeric variables
        '''
        return self.sn