# Decision Trees
## Imports + Turning the labels into numbers for easier classification

In [None]:
import pandas as pd
from collections import Counter 
from collections import defaultdict
import random
import math
import matplotlib.pyplot as plt
import numpy as np
import os

# Integer class id assigned to each activity name.
_LABEL_TO_INDEX = {
    'sitting': 0,
    'lift': 1,
    'towlift': 2,
    'standing': 3,
    'lying': 4,
    'snowboarding': 5,
}


def numeric_labels(data):
    """Convert activity names into their integer class ids.

    Args:
        data: Iterable of activity-name strings. Each must be one of
            'sitting', 'lift', 'towlift', 'standing', 'lying' or
            'snowboarding'.

    Returns:
        A NumPy array with one integer id per element of ``data``, in the
        same order.

    Raises:
        ValueError: If an element is not a known activity name. Skipping it
            instead would return fewer labels than there are inputs, so the
            labels would no longer line up with the rows they describe.
    """
    labels = []
    for position, label in enumerate(data):
        try:
            labels.append(_LABEL_TO_INDEX[label])
        except (KeyError, TypeError):
            # TypeError covers unhashable values, which can never be a known name.
            raise ValueError(
                f"unknown label {label!r} at position {position}; "
                f"expected one of {sorted(_LABEL_TO_INDEX)}"
            ) from None
    return np.asarray(labels)


## Loading in the data and readying variables for training and predicting

In [None]:
# Each split is stored as two CSV files in the working directory:
# one with the features and one with the labels.

# Training split
preprocessed_train_split_feat, train_label = (
    pd.read_csv("preprocessed_train_split_feat.csv"),
    pd.read_csv("train_split_label.csv"),
)

# Validation split
preprocessed_validation_split_feat, validation_split_label = (
    pd.read_csv("preprocessed_validation_split_feat.csv"),
    pd.read_csv("validation_split_label.csv"),
)

# Test split
pre_test, test_label = (
    pd.read_csv("preprocessed_test_feat.csv"),
    pd.read_csv("test_label.csv"),
)

# Online test split
test_online, online_test_label = (
    pd.read_csv("preprocessed_online_test_feat.csv"),
    pd.read_csv("online_test_label.csv"),
)

In [None]:
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from id3 import Id3Estimator
from sklearn.model_selection import GridSearchCV

# Bind each split to a (features, integer labels) pair. The label frames
# carry the activity names in their `Label` column.

# Training split
X, y = preprocessed_train_split_feat, numeric_labels(train_label.Label)

# Validation split
Xt, control = (
    preprocessed_validation_split_feat,
    numeric_labels(validation_split_label.Label),
)

# Test split
X_testing, y_testing = pre_test, numeric_labels(test_label.Label)

# Online test split
X_test, y_test = test_online, numeric_labels(online_test_label.Label)

print(X_testing)

## Using gridsearch to find the best parameters for the ID3 decision tree, training and running the algorithm

In [None]:
# Hyper-parameter grid for the ID3 estimator; only consumed by the
# commented-out GridSearchCV call below.
parameters = {
    'max_depth': [*range(1, 11)],
    'min_samples_split': [*range(1, 21)],
    'prune': (True, False),
    'gain_ratio': (True, False),
    'is_repeating': (True, False),
}
scores = 'accuracy'

# clf = GridSearchCV(Id3Estimator(), parameters, cv=5, scoring=scores, n_jobs=-1)

# Fixed configuration (presumably taken from an earlier grid-search run).
clf = Id3Estimator(
    max_depth=3,
    min_samples_split=13,
    prune=False,
    gain_ratio=True,
    is_repeating=False,
)
clf.fit(X, y)


In [None]:
# Score the fitted `clf` against the labels in `y_test` and print the
# accuracy as a percentage.
control = y_test
pred = clf.predict(X_test)
print(100 * accuracy_score(y_true=control, y_pred=pred))

# print("Best Score:", clf.best_score_)
# print("Best params:", clf.best_params_)

## Results with final data sets (13 metrics) ID3

In [None]:
# Metrics used:
#HR,BR,Posture,Activity,PeakAccel,HRConfidence,ROGState,VerticalMin,VerticalPeak,LateralMin,LateralPeak,SagittalMin,SagittalPeak

# validation 30.0%
# test 48.148148148148145%
# online test 56.41025641025641%

## Training set accuracy results ID3 decision tree 
As stated in our report, we initially thought these accuracy results belonged to the validation set; we later found out that they had only been computed on the training set itself. We have left them in to give a better view of our process.

In [None]:
## 16 metrics
### HR,BR,Posture,Activity,PeakAccel,ECGAmplitude,ECGNoise,HRConfidence,ROGState,ROGTime,VerticalMin,VerticalPeak,LateralMin,LateralPeak,SagittalMin,SagittalPeak

# Best Score: 0.5314009661835749
# Best params: {'gain_ratio': False, 'is_repeating': True, 'max_depth': 4, 'min_samples_split': 5, 'prune': True}


# Best Score: 0.5314009661835749
# Best params: {'gain_ratio': True, 'is_repeating': True, 'max_depth': 7, 'min_samples_split': 12, 'prune': True}

# Best Score: 0.5024154589371981
# Best params: {'gain_ratio': True, 'is_repeating': True, 'max_depth': 10, 'min_samples_split': 40, 'prune': True}


## 4 metrics (hr br posture activity)
### HR,BR,Posture,Activity

# Best Score: 0.5217391304347826
# Best params: {'gain_ratio': False, 'is_repeating': True, 'max_depth': 14, 'min_samples_split': 13, 'prune': True}

## 5 metrics
### HR,BR,Posture,Activity,PeakAccel

# Best Score: 0.5217391304347826
# Best params: {'gain_ratio': False, 'is_repeating': True, 'max_depth': 8, 'min_samples_split': 6, 'prune': True}


## 11 metrics
### HR,BR,Posture,Activity,PeakAccel,VerticalMin,VerticalPeak,LateralMin,LateralPeak,SagittalMin,SagittalPeak

# Best Score: 0.5458937198067633
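# NOTE: 'criterion', 'min_impurity_decrease' and 'random_state' are sklearn DecisionTreeClassifier parameters, not ID3 ones, and this entry is identical to the sklearn 11-metrics result further down; it was probably pasted here by mistake.
# Best params: {'criterion': 'gini', 'max_depth': 3, 'min_impurity_decrease': 0, 'min_samples_split': 2, 'random_state': 2}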


## Using gridsearch to find the best parameters for the sklearn decision tree classifier, training and running the model

In [None]:
# Hyper-parameter grid for sklearn's DecisionTreeClassifier; only consumed
# by the commented-out GridSearchCV call below.
parameters = {
    'max_depth': list(range(1, 7)),
    'min_samples_split': list(range(2, 11)),
    # 0.2 is a single value here: written as `0,2` it would add a duplicate 0
    # and an unintended 2 to the grid.
    'min_impurity_decrease': [0, 0.01, 0.1, 0.2, 1],
    'random_state': list(range(0, 10)),
    'criterion': ['entropy', 'gini'],
}

# Multi-metric scoring definition; not referenced by any call in this cell.
# NOTE(review): the plain 'precision' scorer looks like the binary-only variant,
# while the labels here span six classes - confirm an averaged variant
# (e.g. 'precision_macro') is not needed before passing this to GridSearchCV.
scoring = {'accuracy': make_scorer(accuracy_score),
           'prec': 'precision'}

# Fixed configuration (presumably taken from an earlier grid-search run).
clf = tree.DecisionTreeClassifier(
    criterion='gini',
    max_depth=5,
    min_impurity_decrease=0.01,
    min_samples_split=2,
    random_state=7,
)
# clf = GridSearchCV(tree.DecisionTreeClassifier(), parameters, cv=5, scoring=scores, n_jobs=-1)
clf.fit(X, y)



In [None]:
# Score the fitted `clf` against the labels in `y_test` and print the
# accuracy as a fraction between 0 and 1.
control = y_test
pred_des = clf.predict(X_test)
print(accuracy_score(y_true=control, y_pred=pred_des))

# print("Best Score:", clf.best_score_)
# print("Best params:", clf.best_params_)


## Results with final data sets (13 metrics) Sklearn

In [None]:
## testing
#HR,BR,Posture,Activity,PeakAccel,HRConfidence,ROGState,VerticalMin,VerticalPeak,LateralMin,LateralPeak,SagittalMin,SagittalPeak

# Accuracies below are fractions (0-1), not percentages.
# validation 0.26666666666666666 (about 26.7%)
# test 0.4444444444444444 (about 44.4%)
# online test 0.5384615384615384 (about 53.8%)

## Training set accuracy results SKlearn decision tree classifier
As stated in our report, we initially thought these accuracy results belonged to the validation set; we later found out that they had only been computed on the training set itself. We have left them in to give a better view of our process.


In [None]:
## 16 metrics
### HR,BR,Posture,Activity,PeakAccel,ECGAmplitude,ECGNoise,HRConfidence,ROGState,ROGTime,VerticalMin,VerticalPeak,LateralMin,LateralPeak,SagittalMin,SagittalPeak

# Best Score: 0.6280193236714976
# Best params: {'criterion': 'entropy', 'max_depth': 3, 'min_impurity_decrease': 0.1, 'min_samples_split': 2, 'random_state': 1}

## 12 metrics
### 
# Best Score: 0.5362318840579711
# Best params: {'criterion': 'gini', 'max_depth': 5, 'min_impurity_decrease': 0.01, 'min_samples_split': 2, 'random_state': 3}
# acc = 0.26666666666666666

## 4 metrics 
### HR,BR,Posture,Activity

# Best Score: 0.6183574879227053
# Best params: {'criterion': 'entropy', 'max_depth': 5, 'min_impurity_decrease': 0, 'min_samples_split': 2, 'random_state': 5}


## 5 metrics
### HR,BR,Posture,Activity,PeakAccel

# Best Score: 0.5942028985507246
# Best params: {'criterion': 'entropy', 'max_depth': 5, 'min_impurity_decrease': 0, 'min_samples_split': 2, 'random_state': 3}


## 11 metrics 
### HR,BR,Posture,Activity,PeakAccel,VerticalMin,VerticalPeak,LateralMin,LateralPeak,SagittalMin,SagittalPeak

# Best Score: 0.5458937198067633
# Best params: {'criterion': 'gini', 'max_depth': 3, 'min_impurity_decrease': 0, 'min_samples_split': 2, 'random_state': 2}
