# Data understanding and preprocessing

In [115]:
# import block :D

import pandas as pd
from collections import Counter 
from collections import defaultdict
import random
import math
import matplotlib.pyplot as plt
import numpy as np
import os

# pd.test()

In [126]:
# Load the training features and the training labels from CSV files that are
# expected to sit next to the notebook.
train_feat = pd.read_csv("train_feat.csv")
train_label = pd.read_csv("train_label.csv")

# Load the test features and the test labels the same way.
test_feat = pd.read_csv("test_feat.csv")
test_label = pd.read_csv("test_label.csv")

# ! Not needed, kept it as a comment to be sure

# label = label.dropna(axis=1)
# merged = feat.merge(label, on='Time')
# merged.to_csv("train.csv", index=False)

# train data and labels combined
# train = pd.read_csv("train.csv")

# !

In [117]:
# the train data
train_feat.iloc[:3]

Unnamed: 0,Time,HR,BR,SkinTemp,Posture,Activity,PeakAccel,BRAmplitude,BRNoise,BRConfidence,...,ROGTime,VerticalMin,VerticalPeak,LateralMin,LateralPeak,SagittalMin,SagittalPeak,AuxADC1,AuxADC2,AuxADC3
0,1358759280432,101,18.6,-3276.8,12,0.58,0.95,222,65535,255,...,61,-1.2,-0.51,-0.15,0.72,-0.53,0.31,427,441,515
1,1358759281432,101,17.0,-3276.8,15,0.46,1.02,221,65535,255,...,62,-1.25,-0.39,-0.24,0.77,-0.71,-0.1,430,444,516
2,1358759282432,101,17.0,-3276.8,10,0.19,0.42,253,65535,255,...,63,-1.18,-0.83,-0.18,0.24,-0.31,-0.01,431,444,517


In [118]:
def clean_data(d, metrics=None):
    """Return a copy of ``d`` without constant columns and without ``metrics``.

    A column counts as constant when it holds exactly one distinct value.

    Parameters
    ----------
    d : pandas.DataFrame
        Frame to clean. It is not modified.
    metrics : iterable of str, optional
        Extra column names to drop on top of the constant ones. The iterable
        is only read, never modified.

    Returns
    -------
    pandas.DataFrame
        New frame holding the remaining columns.

    Raises
    ------
    KeyError
        If a name in ``metrics`` is not a column of ``d``.
    """
    # Build a fresh set on every call. A mutable default argument ([]) is
    # shared between calls, so columns found in one frame would leak into the
    # next call and make drop() fail on frames that lack them.
    to_drop = set(metrics) if metrics is not None else set()

    for metric in d:
        # Count the distinct values of *this* frame's column.
        if len(set(d[metric])) == 1:
            to_drop.add(metric)

    # drop() returns a new frame, so the caller's frame stays untouched.
    return d.drop(columns=list(to_drop))

In [130]:
# train_feat
# train_feat = clean_data(train_feat, ['BRAmplitude', 'HRV', 'AuxADC1', 'AuxADC2', 'AuxADC3'])
# test_feat = clean_data(test_feat, ['BRAmplitude', 'HRV', 'AuxADC1', 'AuxADC2', 'AuxADC3'])
test_feat.HR

0        97
1       100
2       102
3       103
4       104
5       102
6       102
7       103
8       105
9       106
10      107
11      105
12      103
13      101
14      100
15      101
16      102
17      102
18      102
19      103
20      104
21      103
22      105
23      105
24      105
25      105
26      104
27      102
28      101
29      101
       ... 
3210    124
3211    125
3212    125
3213    126
3214    126
3215    126
3216    125
3217    124
3218    124
3219    124
3220    124
3221    123
3222    123
3223    124
3224    125
3225    125
3226    125
3227    124
3228    122
3229    121
3230    121
3231    121
3232    122
3233    122
3234    123
3235    124
3236    124
3237    125
3238    125
3239    125
Name: HR, Length: 3240, dtype: int64

In [96]:
# create a csv with the averages of the data (per minute)
def seconds_to_minutes(data, filename, window=60):
    """Average ``data`` over consecutive blocks of rows and write the result as CSV.

    Every block of ``window`` consecutive rows (the last block may be shorter)
    is reduced to one output row holding the column means. If a ``Time``
    column is present, the output keeps the ``Time`` value of the first row of
    each block instead of the mean.

    Parameters
    ----------
    data : pandas.DataFrame
        Input rows, one per time step. Only numeric columns are averaged and
        written; non-numeric columns are left out of the file.
    filename : str
        Path of the CSV file to write. An existing file is overwritten.
    window : int, optional
        Number of rows per block (default 60).

    Raises
    ------
    ValueError
        If ``window`` is not a positive integer.
    """
    if window <= 0:
        raise ValueError("window must be a positive integer")

    numeric = data.select_dtypes(include="number")

    # Rows 0..window-1 get block id 0, the next `window` rows get id 1, and so
    # on. Grouping by position keeps this independent of the frame's index.
    block_ids = np.arange(len(numeric)) // window
    averaged = numeric.groupby(block_ids).mean()

    if "Time" in averaged.columns:
        # Take the first timestamp of every block. Assigning the original
        # values also keeps the column's original dtype in the output.
        averaged["Time"] = numeric["Time"].iloc[::window].to_numpy()

    # to_csv opens and closes the file itself, so no handle can leak.
    averaged.to_csv(filename, index=False)

In [128]:
seconds_to_minutes(test_feat, "test_feat_average.csv")

In [98]:
# the train feat per minute
# NOTE(review): this line appeared twice; the redundant second read was removed.
# If the second line was meant to load "test_feat_average.csv", add that as a
# separate, differently named variable.
# NOTE(review): no visible cell writes "train_feat_average.csv" (only the test
# averages are generated above), so the file must already exist. TODO: confirm.
train_feat_average = pd.read_csv("train_feat_average.csv")

In [99]:
# get the first 3 rows
train_feat_average.iloc[:3]

Unnamed: 0,Time,HR,BR,Posture,Activity,PeakAccel,ECGAmplitude,ECGNoise,HRConfidence,ROGState,ROGTime,VerticalMin,VerticalPeak,LateralMin,LateralPeak,SagittalMin,SagittalPeak
0,1358759280432,97.466667,13.995,27.35,0.122,0.23,0.00481,0.000428,92.883333,1.0,90.5,-0.878,-0.673167,-0.057667,0.126,-0.486167,-0.269333
1,1358759340432,92.25,14.836667,14.15,0.108667,0.2075,0.004908,0.000158,100.0,1.0,150.5,-0.884,-0.742,-0.051833,0.0865,-0.249333,-0.103
2,1358759400432,91.833333,13.843333,-1.166667,0.113,0.199167,0.005037,0.000196,100.0,1.0,210.5,-1.066167,-0.906667,-0.0485,0.159333,-0.0695,0.132833


In [101]:
# get a column (HR for example)
train_feat.HR

0        101
1        101
2        101
3        102
4        103
5        104
6        105
7        106
8        107
9        106
10       107
11       108
12       108
13       109
14       109
15       109
16       109
17       106
18       103
19        99
20        98
21        97
22        96
23        97
24        95
25        93
26        92
27        89
28        89
29        91
        ... 
15990    119
15991    119
15992    120
15993    123
15994    124
15995    126
15996    127
15997    129
15998    130
15999    131
16000    132
16001    132
16002    133
16003    134
16004    135
16005    136
16006    136
16007    137
16008    138
16009    139
16010    140
16011    140
16012    140
16013    140
16014    139
16015    139
16016    139
16017    139
16018    138
16019    137
Name: HR, Length: 16020, dtype: int64

In [102]:
# get the unique values of the metrics
def unique_vals(data):
    """Count the distinct values of every column in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame or mapping
        Anything that yields column names when iterated and returns the
        column's values for ``data[name]``.

    Returns
    -------
    dict
        Maps each column name to the number of distinct values in that column.
    """
    counts = dict()

    # loop over the metrics
    for metric in data:

        # Read the column from the frame that was passed in (not from a
        # global); a set removes the duplicates.
        counts[metric] = len(set(data[metric]))

    return counts

unique_vals(train_feat)
# for metric in train_feat:
#     print("Number of unique values of ",metric,": ",len(list(set(train_feat[metric]))))

{'Activity': 96,
 'BR': 331,
 'ECGAmplitude': 155,
 'ECGNoise': 166,
 'HR': 142,
 'HRConfidence': 95,
 'LateralMin': 134,
 'LateralPeak': 109,
 'PeakAccel': 174,
 'Posture': 140,
 'ROGState': 4,
 'ROGTime': 2039,
 'SagittalMin': 216,
 'SagittalPeak': 192,
 'Time': 16020,
 'VerticalMin': 234,
 'VerticalPeak': 134}

In [103]:
# count how often each value occurs, for every metric
def occurences(data):
    """Tally how often each value appears in every column of ``data``.

    Returns a ``defaultdict`` of ``Counter`` objects keyed by column name.
    A column only gets an entry once at least one of its values was counted.
    """
    tally = defaultdict(Counter)

    # flatten the nested iteration into (column, value) pairs
    pairs = ((metric, value) for metric in data for value in data[metric])

    for metric, value in pairs:
        tally[metric][value] += 1

    return tally

# Store the result under its own name. Assigning it to `occurences` would
# overwrite the function, and re-running this cell would then fail.
occurence_counts = occurences(train_feat)
occurence_counts['ROGState']

Counter({0: 3, 1: 12279, 2: 891, 3: 2847})

In [104]:
# get the indices of the labels that match the activity
def get_indices(labels, activity):
    """Return the index labels of the rows whose 'activity' column equals ``activity``."""
    matches = labels['activity'] == activity
    return labels.index[matches].tolist()

# get the data of a specific activity
def data_of_activity(data, labels, activity):
    """Select the rows of ``data`` at the positions where ``labels`` has ``activity``.

    The index labels returned by ``get_indices`` are used as row positions
    (``iloc``) into ``data``.
    NOTE(review): this presumably relies on ``labels`` having a default
    0..n-1 index that lines up with the rows of ``data``. Confirm.
    """
    return data.iloc[get_indices(labels, activity)]

# test
data_of_activity(train_feat_average, train_label, "sitting")

Unnamed: 0,Time,HR,BR,Posture,Activity,PeakAccel,ECGAmplitude,ECGNoise,HRConfidence,ROGState,ROGTime,VerticalMin,VerticalPeak,LateralMin,LateralPeak,SagittalMin,SagittalPeak
0,1358759280432,97.466667,13.995,27.35,0.122,0.23,0.00481,0.000428,92.883333,1.0,90.5,-0.878,-0.673167,-0.057667,0.126,-0.486167,-0.269333
1,1358759340432,92.25,14.836667,14.15,0.108667,0.2075,0.004908,0.000158,100.0,1.0,150.5,-0.884,-0.742,-0.051833,0.0865,-0.249333,-0.103
2,1358759400432,91.833333,13.843333,-1.166667,0.113,0.199167,0.005037,0.000196,100.0,1.0,210.5,-1.066167,-0.906667,-0.0485,0.159333,-0.0695,0.132833
3,1358759460432,98.533333,18.543333,-11.166667,0.050333,0.111,0.005118,0.000208,100.0,1.0,270.5,-1.029167,-0.937667,-0.104833,0.036167,0.128833,0.231167
4,1358759520432,100.933333,17.256667,35.666667,0.174167,0.275833,0.00483,0.000223,100.0,1.0,330.5,-0.782333,-0.5855,-0.065833,0.124833,-0.640333,-0.397
5,1358759580432,100.133333,8.1,2.366667,0.187833,0.318667,0.005029,0.000498,91.2,1.0,390.5,-1.091667,-0.784667,-0.047,0.229833,-0.160833,0.139333
6,1358759640432,91.8,9.913333,32.366667,0.069167,0.135167,0.005173,0.000223,99.833333,1.0,450.5,-0.879667,-0.732167,-0.03,0.087833,-0.586333,-0.433
13,1358760300432,102.75,12.405,7.816667,0.169167,0.2905,0.00495,0.000379,99.95,1.0,70.5,-1.0455,-0.796833,-0.096,0.168333,-0.229333,0.035
14,1358760360432,97.8,12.873333,5.083333,0.1565,0.274833,0.004944,0.000354,93.083333,1.0,130.5,-1.061833,-0.833333,-0.08,0.194333,-0.201333,0.082
15,1358760480432,103.1,16.436667,-10.4,0.255167,0.528667,0.00512,0.000264,100.0,1.0,250.5,-1.472333,-0.648833,-0.100167,0.290167,-0.085167,0.384833


In [None]:
# split the data columns into discrete and numeric metrics, given a threshold
def discrete_numeric_split(data, thres=10):
    """Partition the columns of ``data`` by their number of distinct values.

    A column with more than ``thres`` distinct values (as reported by
    ``unique_vals``) is treated as numeric; every other column is treated as
    discrete.

    Returns a ``(discrete, numeric)`` tuple of dicts, each mapping a column
    name to that column of ``data``.
    """
    counts = unique_vals(data)
    discrete, numeric = dict(), dict()

    for metric in data:
        # pick the target dict first, then store the column in it
        bucket = numeric if counts.get(metric) > thres else discrete
        bucket[metric] = data[metric]

    return discrete, numeric

discrete, numeric = discrete_numeric_split(train_feat)

In [None]:
print(len(discrete),discrete)

In [None]:
print(len(numeric),numeric)

In [None]:
# get the ratio of the current activity
def ratio(labels, activity=True):
    """Return the fraction of ``labels`` that equal ``activity``.

    ``activity`` defaults to ``True`` so that ``ratio(labels)`` gives the
    share of positive labels in a boolean label list. Returns 0 when
    ``labels`` is empty.
    """
    # get the total number of labels
    total = len(labels)

    # no labels, no ratio
    if not total:
        return 0

    # count the labels that are equal to our activity
    activity_count = sum(1 for label in labels if label == activity)

    return activity_count/total


# calculate the partial entropy
def entropy_sub(p):
    """Return ``p * log2(p)``, with the limit value 0 for ``p == 0``."""
    # log(0) is undefined, but p*log(p) tends to 0
    if p == 0:
        return 0

    return p*math.log(p,2)


# calculate the entropy
def entropy(labels):
    """Return the Shannon entropy (in bits) of the label distribution.

    Computes ``-sum(p_c * log2(p_c))`` over every distinct label ``c`` in
    ``labels``. An empty collection has entropy 0.
    """
    e = 0

    # loop over the distinct activities in the label list
    for activity in set(labels):

        # every class contributes exactly one -p*log2(p) term; adding a
        # (1-p) term per class as well would cancel or double-count terms
        e -= entropy_sub(ratio(labels, activity))

    return e


# calculate the entropy after a split
def split_entropy(labels, indices, N):
    """Return the weighted entropy of ``labels`` after splitting it into subsets.

    Parameters
    ----------
    labels : sequence
        All labels; ``labels[i]`` is the label at index ``i``.
    indices : list of lists of int
        One index list per subset of the split. Indices that are not below
        ``N`` are ignored.
    N : int
        Total number of labels, used to weight every subset.

    Returns
    -------
    float
        ``sum(len(subset)/N * entropy(subset))``; 0 when ``N`` is 0 or
        ``indices`` is empty.
    """
    # nothing to weight against
    if not N:
        return 0

    e = 0

    for sub_indices in indices:

        # get the labels of the indices
        sub_labels = [labels[index] for index in sub_indices if index < N]

        # entropy of this subset only, weighted by this subset's share
        e += (len(sub_labels)/N)*entropy(sub_labels)

    return e


# calculate the information gain of the split
def information_gain(label_data, indices):
    """Return the entropy reduction achieved by splitting the labels on ``indices``.

    Parameters
    ----------
    label_data : object with a ``Label`` attribute, or a sequence of labels
        Either a frame-like object whose ``Label`` attribute holds the labels,
        or the labels themselves.
    indices : list of lists of int
        One list of positions per subset of the split.
    """
    # accept both a labelled frame and a bare label sequence; a list makes
    # the indices positional regardless of the input type
    labels = list(getattr(label_data, 'Label', label_data))

    # get the number of labels
    total = len(labels)

    # entropy before the split minus the weighted entropy after it
    return entropy(labels) - split_entropy(labels, indices, total)

In [None]:
# tests for me to check if the code worked
e = entropy(train_label.Label)
ig = information_gain(train_label, [[11, 22, 33, 44, 55, 66, 77, 88, 99, 111, 122, 133, 144, 155, 166, 177, 188, 199]])

print(e,ig)

#  {'Label': Counter({  'lift':         51,
#                       'lying':        22,
#                       'sitting':      58,
#                       'snowboarding': 51,
#                       'standing':     88    }),
#
#                       'total':        270

In [None]:
class Data(object):
    """Container for the discrete and the numeric columns of a data set.

    Both arguments are expected to map a metric name to an indexable
    collection of values (for example the dicts returned by
    ``discrete_numeric_split``).
    """

    def __init__(self, discrete, numeric):
        self.discrete = discrete
        self.numeric = numeric
        # cache the number of discrete and numeric variables
        self.sd = len(discrete)
        self.sn = len(numeric)

    def get_discrete(self, metric, index):
        """Return the discrete value of ``metric`` at ``index``, or -1 if either is invalid."""
        try:
            return self.discrete[metric][index]
        except (LookupError, TypeError):
            # unknown metric, index out of range, or an unusable index type
            return -1

    def get_numeric(self, metric, index):
        """Return the numeric value of ``metric`` at ``index``, or -1 if either is invalid."""
        try:
            return self.numeric[metric][index]
        except (LookupError, TypeError):
            # unknown metric, index out of range, or an unusable index type
            return -1

    def size_discrete(self):
        """Return the number of discrete variables."""
        return self.sd

    def size_numeric(self):
        """Return the number of numeric variables."""
        return self.sn

In [None]:
class DecisionTree(object):
    def __init__(self, data, labels, tree_type=0, thres=0.1):
        """ Creates a Decision Tree, based on the following arguments:
                data - An array of DataRow objects, each instance containing
                        the discrete and numeric data for one patient
                labels - An array of boolean class labels, each corresponding to a
                        DataRow instance of a patient at the same index.
                tree_type - 0: create the Tree with the highest IG every node
                            1: create DiscreteTrees only
                            2: create NumericTrees only
                thres - The cutoff value for IG, to stop splitting the tree.
                        Below this value the node becomes a leaf node and no
                        further splits are made."""

        # Store the basic attributes for any DecisionTree
        self.data = data
        self.labels = labels
        self.tree_type = tree_type
        self.thres = thres

        # Compute the share of positive labels and assign this node the most
        # common label. ratio() needs the label value to count, so pass it.
        self.ratio = ratio(self.labels, True)
        self.label = self.ratio >= 0.5

        if self.tree_type == 1:
            # Convert this DecisionTree to a DiscreteTree and perform the split
            discr_tree = DiscreteTree(self)
            self.convert_tree(discr_tree)
        elif self.tree_type == 2:
            # Convert this DecisionTree to a NumericTree and perform the split
            numer_tree = NumericTree(self)
            self.convert_tree(numer_tree)
        else:
            # Create a DiscreteTree and NumericTree, passing all the stored attributes
            # as an argument, and compute the best possible split for each
            discr_tree = DiscreteTree(self)
            numer_tree = NumericTree(self)

            # Based on the results of the split computations, replace this generic
            # DecisionTree node with either a DiscreteTree or a NumericTree node
            if discr_tree.info_gain > numer_tree.info_gain:
                self.convert_tree(discr_tree)
            else:
                self.convert_tree(numer_tree)

        # Create an empty dictionary to contain the (possible) branches from this node,
        # where the values should be new DecisionTree nodes, or None if not present
        self.branches = defaultdict(lambda: None)

        # Check if this split produced a high enough Information Gain to actually create
        # the resulting branches with new split nodes below it, else this is a leaf node
        self.leaf = self.info_gain < self.thres
        if not self.leaf:
            self.create_subtrees()

    def store_split_values(self, var_index, var_values, indices, info_gain):
        """ Stores the values of the passed parameters as object attributes. Is intended
            to store the results of a split computation for either a DiscreteTree or a
            NumericTree. The stored attributes are:
                var_index - The DataRow index of the variable on which the split was
                    based.
                var_values - A list of the possible values that this split variable can
                    take, each corresponding to a different branch in the DecisionTree
                indices - A list of index lists, with each list containing the indices
                    defining a subset of the current data and label attributes, as
                    computed by the split. The order of these subsets should match the
                    order of the corresponding var_values used to define the branches
                    of the split.
                info_gain - Information Gain computed for this split"""
        self.var_index = var_index
        self.var_values = var_values
        self.indices = indices
        self.info_gain = info_gain

    def convert_tree(self, new_tree):
        """ Converts this object to the tree passed as the new_tree parameter.
            All attributes from the new_tree are transfered.
                new_tree - Either a DiscreteTree or a NumericTree instance, to which
                            this object is converted"""
        self.__class__ = new_tree.__class__
        self.__dict__ = new_tree.__dict__

    def create_subtrees(self):
        """ Creates the different subsets of the current data and labels, and makes a
            a new DecisionTree node for each such subset, based on the indices attribute
            stored after the computed split. These new DecisionTrees are stored in the
            branches attribute, a dictionary mapping the value of a variable from the
            split to the new DecisionTree created by selecting that value for the split.
            A value whose index list is empty gets no branch."""

        # var_values and indices are stored in matching order, so walk them together
        for value, sub_indices in zip(self.var_values, self.indices):

            # an empty subset cannot be split any further; leave the branch absent
            if not sub_indices:
                continue

            # build the data and label subsets for this value
            sub_data = [self.data[index] for index in sub_indices]
            sub_labels = [self.labels[index] for index in sub_indices]

            # create the branch
            self.branches[value] = DecisionTree(sub_data, sub_labels, self.tree_type, self.thres)

    def classify(self, row):
        """ Traverses the DecisionTree based on the values stored in the data row and
            returns the most common label in the resulting leaf node (1.0 or 0.0).
            A node without labels classifies as 0.0.
                row - The DataRow object containing the values that are being
                        classified"""

        # get the subtree or None
        subtree = self.get_subtree(row)

        # classify the row for the subtree
        if subtree:
            return subtree.classify(row)

        # this node decides; without labels there is no majority to compute
        if len(self.labels) == 0:
            return 0.0

        # 1.0 when the positive label is at least as common as the negative one
        if sum(self.labels)/len(self.labels) >= 0.5:
            return 1.0
        return 0.0

    def validate(self, data, labels):
        """ Classifies all the DataRow instances in data and compares the outcome to
            the provided labels. Returns the percentage of elements that was classified
            correctly, or 0.0 when data is empty.
                data - List of DataRow instances to be classified.
                labels - List of boolean labels each belonging to a DataRow instances at
                    the same index"""

        total = len(data)

        # avoid dividing by zero on an empty validation set
        if total == 0:
            return 0.0

        # count the rows whose predicted label matches the given label
        correct = sum(1 for index in range(total)
                      if self.classify(data[index]) == labels[index])

        return 100*correct/total

    def split(self):
        """ Must be implemented by the subclass based on the specific type of split performed.
            The function here is only to ensure it is implemented, and should not be modified."""
        raise NotImplementedError

    def get_subtree(self, instance):
        """ Must be implemented by the subclass based on the specific type of split performed.
            The function here is only to ensure it is implemented, and should not be modified."""
        raise NotImplementedError


In [None]:
class DiscreteTree(DecisionTree):
    def __init__(self, dtree):
        """ Takes a DecisionTree as initialization parameter and copies all its
            attributes. Then calls the split() function to determine the optimal
            discrete variable to split this subset of the data on.
                dtree - The DecisionTree instance whose attributes are copied to this
                        DiscreteTree instance."""
        self.__dict__ = dtree.__dict__.copy()
        self.split()

    def split(self):
        """ Determines the best discrete variable to split the current dataset on,
            based on the IG resulting from the split. For this best split variable, the
            function stores several resulting attributes from the split, using the
            store_split_values function. See the documentation of store_split_values
            for an overview of what should be stored."""

        # initial gain (-1 instead of 0 so that we set the info_gain at least once)
        highest_gain = -1

        # the total number of discrete variables
        total_vars = self.data[0].size_discrete()

        # for every column
        for var_index in range(total_vars):

            # collect the (row index, value) pairs of this variable
            column = DataColumn(var_index)
            for row_index, row in enumerate(self.data):
                column.add(row_index, row.get_discrete(var_index))

            # one index list per unique value, in the same order as var_values
            indices = []
            var_values = []
            for unique_value in column.unique_values:
                indices.append([entry.index for entry in column.entries
                                if entry.value == unique_value])
                var_values.append(unique_value)

            # Score the complete partition once. Scoring inside the loop above
            # would only evaluate partial partitions that are overwritten anyway.
            info_gain = information_gain(self.labels, indices) if indices else 0

            # store the split if it is the one with the highest gain
            if info_gain > highest_gain:
                self.store_split_values(var_index, var_values, indices, info_gain)
                highest_gain = info_gain

    def get_subtree(self, row):
        """ Returns the subtree one branch down, corresponding the to value of
            variable in the DataRow for specific variable based on which the split
            at this node was performed.
            Returns None if the value was not present at the split.
                row - The DataRow object containing the values that are being
                        classified"""

        # get the value of the correct column
        value = row.get_discrete(self.var_index)

        # .get() looks the branch up without inserting a None entry for
        # values that were not seen at the split
        try:
            return self.branches.get(value)
        except TypeError:
            # an unhashable value can never match a branch key
            return None

In [None]:
class NumericTree(DecisionTree):
    def __init__(self, dtree):
        """ Builds a NumericTree from an existing DecisionTree: copies every
            attribute of dtree and then runs split() to find the best numeric
            boundary for this subset of the data.
                dtree - The DecisionTree instance whose attributes are copied to this
                        NumericTree instance."""
        self.__dict__ = dtree.__dict__.copy()
        self.split()

    def split(self):
        """ Searches every numeric variable and every value it takes for the
            boundary with the highest IG. The winning split is recorded through
            store_split_values, and the boundary value itself is kept in the
            boundary attribute."""

        # start below zero so that the first candidate is always stored
        best_gain = -1

        # the number of numeric variables, taken from the first row
        total_vars = self.data[0].size_numeric()

        # Branch names in the same order as the index lists built below:
        # rows below the boundary come first, the rest second. Swapping the
        # names would send every row down the wrong branch.
        var_values = ['smaller', 'bigger']

        for var_index in range(total_vars):

            # collect the (row index, value) pairs of this variable
            column = DataColumn(var_index)
            for row_index in range(len(self.data)):
                column.add(row_index, self.data[row_index].get_numeric(var_index))

            # try every value of the variable as the boundary
            for candidate in column.unique_values:

                below = [entry.index for entry in column.entries if entry.value < candidate]
                at_or_above = [entry.index for entry in column.entries if entry.value >= candidate]
                indices = [below, at_or_above]

                info_gain = information_gain(self.labels, indices)

                # a candidate has to beat the best gain so far to be kept
                if not info_gain > best_gain:
                    continue

                self.store_split_values(var_index, var_values, indices, info_gain)
                best_gain = info_gain

                # also store the candidate as the boundary
                self.boundary = candidate

    def get_subtree(self, row):
        """ Returns the subtree one branch down: the 'smaller' branch when the row's
            value for the split variable lies below the stored boundary, the
            'bigger' branch otherwise.
                row - The DataRow object containing the values that are being
                        classified"""

        # pick the branch name from the comparison, then look it up
        value = row.get_numeric(self.var_index)
        side = 'smaller' if value < self.boundary else 'bigger'
        return self.branches[side]