In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score, mean_squared_error, r2_score, accuracy_score, confusion_matrix, classification_report, plot_confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn import tree

# Show every column and every row when a DataFrame is displayed.
# Use the canonical option names: "display.max.columns" only worked because
# pandas treats the name as a regex and "." happened to match the underscore.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [None]:
# Load the cleaned dataset; the path is relative to the notebook's working directory.
clean_data_path = '../data/clean_data.csv'
data = pd.read_csv(clean_data_path)

# Cell output: the column labels available for the drop lists used below.
data.columns

## Define a function running the model

In [None]:
def run_decision_tree(DROP_LIST, VALUE, FEATURES=None):
    """Train a decision tree on the module-level ``data`` frame and report on it.

    The function drops ``DROP_LIST`` from ``data``, uses ``VALUE`` as the target
    and every remaining column as a feature, holds out 20% of the rows
    (``random_state=100``), fits a ``DecisionTreeClassifier`` and then prints /
    plots: train and hold-out accuracy, tree depth, feature importances, the
    classification report, a row-normalized confusion matrix and the tree itself.

    Parameters
    ----------
    DROP_LIST : list of str
        Columns to be dropped from the original dataframe.
    VALUE : str
        Target column of the model.
    FEATURES : list of str, optional
        Expected feature columns. The tree plot is always labeled with the
        columns the model was actually fitted on, so a wrong order here can no
        longer mislabel the plot; a warning is emitted when ``FEATURES`` does
        not match those columns exactly.

    Returns
    -------
    None
    """
    import warnings

    # drop the columns we don't need
    df = data.drop(DROP_LIST, axis=1)

    # define features and value
    X = df.drop(VALUE, axis=1)
    y = df[VALUE]

    # The fitted column order is the only order that labels tree nodes correctly.
    feature_names = list(X.columns)
    if FEATURES is not None and list(FEATURES) != feature_names:
        warnings.warn(
            "FEATURES %s does not match the model's columns %s; "
            "labeling the tree with the model's columns."
            % (list(FEATURES), feature_names)
        )

    # create train and hold-out datasets
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=100)

    # train the model
    model = DecisionTreeClassifier(random_state=100)
    model.fit(X_train, y_train)

    # calculate accuracy on both splits
    predict_train = model.predict(X_train)
    predict_val = model.predict(X_val)
    accuracy_train = accuracy_score(y_train, predict_train)
    accuracy_val = accuracy_score(y_val, predict_val)
    print("Accuracies")
    print('accuracy_score on train dataset : ', accuracy_train)
    print('accuracy_score on test dataset : ', accuracy_val)
    print('\n')
    print('Depth of the Decision Tree :', model.get_depth())
    print('\n')

    # calculate feature importances, most important first
    importances = pd.DataFrame({'feature': X_train.columns,
                                'importance': np.round(model.feature_importances_, 3)})
    importances = importances.sort_values('importance', ascending=False)
    print('Importances')
    print(importances)
    print('\n')

    # print classification report for the hold-out split
    print('Classification report')
    print(classification_report(y_val, predict_val))
    print('\n')

    # plot the confusion matrix, normalized over the true labels (rows)
    # NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; on newer
    # versions switch to ConfusionMatrixDisplay.from_estimator.
    matrix = plot_confusion_matrix(model, X_val, y_val, cmap=plt.cm.Blues, normalize='true')
    matrix.ax_.set_title('Confusion Matrix')
    matrix.figure_.set_size_inches(7, 7)
    plt.show()

    # draw the decision tree, labeled with the fitted column names
    plt.figure(figsize=(30, 30))
    tree.plot_tree(model, fontsize=8, feature_names=feature_names)
    plt.show()

## Cross-validation (10 random 80/20 shuffle splits) - define function

In [None]:
def validate_decision_tree(DROP_LIST, VALUE):
    """Cross-validate a decision tree on the module-level ``data`` frame.

    Scores an unfitted ``DecisionTreeClassifier(random_state=100)`` with
    ``cross_val_score`` over a ``ShuffleSplit`` of 10 random splits, each
    holding out 20% of the rows (``random_state=100``). These are repeated
    random splits, not disjoint K-fold partitions.

    Parameters
    ----------
    DROP_LIST : list of str
        Columns to be dropped from the original dataframe.
    VALUE : str
        Target column of the model.

    Returns
    -------
    None
        The per-split scores and their mean +/- standard deviation are printed.
    """
    # Whatever is left after removing DROP_LIST and the target is a feature.
    remaining = data.drop(columns=DROP_LIST)
    target = remaining[VALUE]
    predictors = remaining.drop(columns=VALUE)

    # Splitter and estimator are both seeded, so repeated runs give the same scores.
    splitter = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)
    classifier = DecisionTreeClassifier(random_state=100)
    scores = cross_val_score(classifier, predictors, target, cv=splitter, n_jobs=1)

    print('Cross Validation accuracy scores: %s' % scores)
    print('Cross Validation accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

## Decision Tree 1
#### Features = 'capacity', 'vib1', 'vib2', 'amp_uni', 'mic1', 'mic2'
#### Value = 'type_cat'

In [None]:
# Decision Tree 1: target 'type_cat'; every column not listed here (and not the
# target) is used as a feature.
drop_list1 = [
    'Unnamed: 0', 'type', 'capacity_cat', 'pump_cat',
    'anomaly', 'anomaly_cat', 'anomaly_binary',
    'datetime', 'date', 'time',
    'vib1_x', 'vib1_y', 'vib1_z',
    'vib2_x', 'vib2_y', 'vib2_z',
    'amp1', 'amp2', 'pump', 'anomaly_binary_cat',
]

run_decision_tree(
    DROP_LIST=drop_list1,
    VALUE='type_cat',
    FEATURES=['capacity', 'vib1', 'vib2', 'amp_uni', 'mic1', 'mic2'],
)

In [None]:
# Cross-validate the Decision Tree 1 configuration.
validate_decision_tree(DROP_LIST=drop_list1, VALUE='type_cat')

## Decision Tree 2
#### Features = 'capacity', 'vib1_x', 'vib1_y', 'vib1_z', 'vib2_x', 'vib2_y', 'vib2_z', 'amp1', 'amp2', 'mic1', 'mic2'
#### Value = 'type_cat'

In [None]:
# Decision Tree 2: target 'type_cat'; every column not listed here (and not the
# target) is used as a feature.
drop_list2 = [
    'Unnamed: 0', 'type', 'capacity_cat', 'pump_cat',
    'anomaly', 'anomaly_cat', 'anomaly_binary',
    'datetime', 'date', 'time',
    'pump', 'anomaly_binary_cat',
    'vib1', 'vib2', 'amp_uni',
]

run_decision_tree(
    DROP_LIST=drop_list2,
    VALUE='type_cat',
    FEATURES=['capacity', 'vib1_x', 'vib1_y', 'vib1_z', 'vib2_x', 'vib2_y', 'vib2_z',
              'amp1', 'amp2', 'mic1', 'mic2'],
)

In [None]:
# Cross-validate the Decision Tree 2 configuration.
validate_decision_tree(DROP_LIST=drop_list2, VALUE='type_cat')

## Decision Tree 3
#### Features = 'vib1',  'vib2', 'amp_uni', 'mic1', 'mic2'
#### Value = 'type_cat'

In [None]:
# Decision Tree 3: target 'type_cat'; 'capacity' is dropped as well, and every
# column not listed here (and not the target) is used as a feature.
drop_list3 = [
    'Unnamed: 0', 'type', 'capacity', 'capacity_cat', 'pump_cat',
    'anomaly', 'anomaly_cat', 'anomaly_binary',
    'datetime', 'date', 'time',
    'vib1_x', 'vib1_y', 'vib1_z',
    'vib2_x', 'vib2_y', 'vib2_z',
    'amp1', 'amp2', 'pump', 'anomaly_binary_cat',
]

run_decision_tree(
    DROP_LIST=drop_list3,
    VALUE='type_cat',
    FEATURES=['vib1', 'vib2', 'amp_uni', 'mic1', 'mic2'],
)

In [None]:
# Cross-validate the Decision Tree 3 configuration.
validate_decision_tree(DROP_LIST=drop_list3, VALUE='type_cat')

## Decision Tree 4
#### Features = 'vib1_x', 'vib1_y', 'vib1_z', 'vib2_x', 'vib2_y', 'vib2_z', 'amp1', 'amp2', 'mic1', 'mic2'
#### Value = 'type_cat'

In [None]:
# Decision Tree 4: target 'type_cat'; 'capacity' is dropped as well, and every
# column not listed here (and not the target) is used as a feature.
drop_list4 = [
    'Unnamed: 0', 'type', 'capacity', 'capacity_cat', 'pump_cat',
    'anomaly', 'anomaly_cat', 'anomaly_binary',
    'datetime', 'date', 'time',
    'pump', 'anomaly_binary_cat',
    'vib1', 'vib2', 'amp_uni',
]

run_decision_tree(
    DROP_LIST=drop_list4,
    VALUE='type_cat',
    FEATURES=['vib1_x', 'vib1_y', 'vib1_z', 'vib2_x', 'vib2_y', 'vib2_z',
              'amp1', 'amp2', 'mic1', 'mic2'],
)

In [None]:
# Cross-validate the Decision Tree 4 configuration.
validate_decision_tree(DROP_LIST=drop_list4, VALUE='type_cat')

## Decision Tree 5
#### Features = 'capacity', 'vib1', 'vib2', 'amp_uni', 'mic1', 'mic2'
#### Value = 'anomaly_binary'

In [None]:
# Decision Tree 5: target 'anomaly_binary'; every column not listed here (and
# not the target) is used as a feature.
drop_list5 = [
    'Unnamed: 0', 'type', 'type_cat', 'capacity_cat', 'pump_cat',
    'anomaly', 'anomaly_cat',
    'datetime', 'date', 'time',
    'vib1_x', 'vib1_y', 'vib1_z',
    'vib2_x', 'vib2_y', 'vib2_z',
    'amp1', 'amp2', 'pump', 'anomaly_binary_cat',
]

run_decision_tree(
    DROP_LIST=drop_list5,
    VALUE='anomaly_binary',
    FEATURES=['capacity', 'vib1', 'vib2', 'amp_uni', 'mic1', 'mic2'],
)

In [None]:
# Cross-validate the Decision Tree 5 configuration.
validate_decision_tree(DROP_LIST=drop_list5, VALUE='anomaly_binary')

## Decision Tree 6
#### Features = 'capacity', 'vib1_x','vib1_y', 'vib1_z', 'vib2_x', 'vib2_y', 'vib2_z', 'amp1', 'amp2', 'mic1', 'mic2'
#### Value = 'anomaly_binary'

In [None]:
# Decision Tree 6: target 'anomaly_binary'; every column not listed here (and
# not the target) is used as a feature.
drop_list6 = [
    'Unnamed: 0', 'type', 'type_cat', 'capacity_cat', 'pump_cat',
    'anomaly', 'anomaly_cat',
    'datetime', 'date', 'time',
    'vib1', 'vib2', 'amp_uni',
    'pump', 'anomaly_binary_cat',
]

run_decision_tree(
    DROP_LIST=drop_list6,
    VALUE='anomaly_binary',
    FEATURES=['capacity', 'vib1_x', 'vib1_y', 'vib1_z', 'vib2_x', 'vib2_y', 'vib2_z',
              'amp1', 'amp2', 'mic1', 'mic2'],
)

In [None]:
# Cross-validate the Decision Tree 6 configuration.
validate_decision_tree(DROP_LIST=drop_list6, VALUE='anomaly_binary')

## Decision Tree 7
#### Features = 'vib1', 'vib2', 'amp_uni', 'mic1', 'mic2'
#### Value = 'anomaly_binary'

In [None]:
# Decision Tree 7: target 'anomaly_binary'; 'capacity' is dropped as well, and
# every column not listed here (and not the target) is used as a feature.
drop_list7 = [
    'Unnamed: 0', 'type', 'capacity', 'type_cat', 'capacity_cat', 'pump_cat',
    'anomaly', 'anomaly_cat',
    'datetime', 'date', 'time',
    'vib1_x', 'vib1_y', 'vib1_z',
    'vib2_x', 'vib2_y', 'vib2_z',
    'amp1', 'amp2', 'pump', 'anomaly_binary_cat',
]

run_decision_tree(
    DROP_LIST=drop_list7,
    VALUE='anomaly_binary',
    FEATURES=['vib1', 'vib2', 'amp_uni', 'mic1', 'mic2'],
)

In [None]:
# Cross-validate the Decision Tree 7 configuration.
validate_decision_tree(DROP_LIST=drop_list7, VALUE='anomaly_binary')

## Decision Tree 8
#### Features = 'vib1_x', 'vib1_y', 'vib1_z', 'vib2_x', 'vib2_y', 'vib2_z', 'amp1', 'amp2', 'mic1', 'mic2'
#### Value = 'anomaly_binary'

In [None]:
# Decision Tree 8: target 'anomaly_binary'; 'capacity' is dropped as well, and
# every column not listed here (and not the target) is used as a feature.
drop_list8 = [
    'Unnamed: 0', 'type', 'type_cat', 'capacity', 'capacity_cat', 'pump_cat',
    'anomaly', 'anomaly_cat',
    'datetime', 'date', 'time',
    'vib1', 'vib2', 'amp_uni',
    'pump', 'anomaly_binary_cat',
]

run_decision_tree(
    DROP_LIST=drop_list8,
    VALUE='anomaly_binary',
    FEATURES=['vib1_x', 'vib1_y', 'vib1_z', 'vib2_x', 'vib2_y', 'vib2_z',
              'amp1', 'amp2', 'mic1', 'mic2'],
)


In [None]:
# Cross-validate the Decision Tree 8 configuration.
validate_decision_tree(DROP_LIST=drop_list8, VALUE='anomaly_binary')