In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble  import AdaBoostClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score, mean_squared_error, r2_score, accuracy_score, confusion_matrix, classification_report, plot_confusion_matrix, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn import tree

import sys
sys.path.insert(0, '../src/features/')
from custom_metric import *

# Show every column when a wide DataFrame is displayed. The full option name
# is spelled out: abbreviated/pattern-style names are only resolved by pandas
# through regex matching and can become ambiguous when new options are added.
pd.set_option("display.max_columns", None)

In [None]:
#define our custom metric

def cedric_metric_binary(y_true, y_pred, a=500):
    """Score binary predictions as ``specificity * recall ** a``.

    The confusion matrix is unpacked as ``tn, fp, fn, tp``, so the label that
    sorts first is treated as the negative class. Raising recall to the power
    ``a`` makes the score drop towards 0 as soon as positives are missed, so
    false negatives weigh far more than false positives.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels, passed straight to ``confusion_matrix``.
    a : int or float, default 500
        Exponent applied to recall (previously a hard-coded local).

    Returns
    -------
    float
        The metric value. If one of the classes has no true samples the
        corresponding ratio is 0/0, which NumPy evaluates to ``nan``.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` together do not contain exactly two labels.
    """
    cm = confusion_matrix(y_true, y_pred)
    if cm.shape != (2, 2):
        # Without this check the unpack below fails with a generic
        # "not enough / too many values to unpack" message.
        raise ValueError(
            "cedric_metric_binary expects exactly two classes, "
            "got a confusion matrix of shape %s" % (cm.shape,)
        )

    tn, fp, fn, tp = cm.ravel()
    specificity = tn / (tn + fp)
    recall = tp / (tp + fn)

    return specificity * recall ** a

def cedric_metric_nonbinary(y_true, y_pred, a=500):
    """Four-class counterpart of ``cedric_metric_binary``.

    Row/column 0 of the confusion matrix (the label that sorts first) is
    treated as the negative class; rows 1-3 are the three positive classes.
    The score is the product of the four per-class recalls (diagonal / row
    total), multiplied by ``(1 - fn_i / p_i) ** a`` for each positive class
    ``i``, where ``fn_i`` counts samples of class ``i`` predicted as class 0.
    Predicting a positive sample as negative is therefore penalised much more
    than confusing two positive classes.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels, passed straight to ``confusion_matrix``.
    a : int or float, default 500
        Exponent of the false-negative penalty terms (previously hard-coded).

    Returns
    -------
    float
        The metric value; ``nan`` if a class has no true samples (0/0).

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` together do not contain exactly four labels.
    """
    cm = confusion_matrix(y_true, y_pred)
    if cm.shape != (4, 4):
        # Keeps the failure explicit, e.g. for a CV fold missing one class.
        raise ValueError(
            "cedric_metric_nonbinary expects exactly four classes, "
            "got a confusion matrix of shape %s" % (cm.shape,)
        )

    # Row totals = number of true samples per class.
    n, p1, p2, p3 = cm.sum(axis=1)
    # Diagonal = correctly classified samples per class.
    tn, tp1, tp2, tp3 = cm.diagonal()
    # First column below the diagonal = positives predicted as negative.
    fn1, fn2, fn3 = cm[1:, 0]

    metric = (tn / n) * (tp1 / p1) * (tp2 / p2) * (tp3 / p3) * (1 - fn1 / p1) ** a * (1 - fn2 / p2) ** a * (1 - fn3 / p3) ** a

    return metric
    
    
    
    

In [None]:
# Read the cleaned dataset; the path is relative to the notebook's working directory.

data = pd.read_csv('../data/clean_data.csv')
# Trailing expression: the notebook displays the column index, a handy
# reference for the drop lists defined in the following sections.
data.columns

## Ada Boost 1 - Multiclass
#### Features = 'capacity', 'vib1', 'vib2', 'amp_uni', 'mic1', 'mic2'
#### Value = 'type_cat'

In [None]:
# Target column for this experiment.
value = 'type_cat'

# Columns removed from `data` before modelling; whatever remains besides
# `value` is used as a feature.
drop_list = [
    # presumably alternative labels / encodings - not used as features here
    'type',
    'capacity_cat',
    'pump_cat',
    'anomaly',
    'anomaly_cat',
    'anomaly_binary',
    # timestamp columns
    'datetime',
    'date',
    'time',
    # per-axis / per-channel columns (the combined columns are kept instead)
    'vib1_x',
    'vib1_y',
    'vib1_z',
    'vib2_x',
    'vib2_y',
    'vib2_z',
    'amp1',
    'amp2',
    'pump',
    'anomaly_binary_cat',
]


In [None]:
# Keep only the feature columns and the target.
df = data.drop(columns=drop_list)

# Split into target vector and feature matrix.
y = df[value]
X = df.drop(columns=value)

# 80/20 hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=100, test_size=0.20
)

# Baseline: AdaBoost over an unconstrained decision tree, all other settings default.
model = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(random_state=100),
    random_state=100,
)
model.fit(X_train, y_train)

# Accuracy on the training part and on the hold-out part.
predict_train = model.predict(X_train)
accuracy_train = accuracy_score(y_train, predict_train)
predict_val = model.predict(X_val)
accuracy_val = accuracy_score(y_val, predict_val)

print("Accuracies")
print('accuracy_score on train dataset : ', accuracy_train)
# The "test dataset" label refers to the hold-out split (X_val / y_val).
print('accuracy_score on test dataset : ', accuracy_val)
print('\n')

# Feature importances, rounded to 3 decimals, largest first.
importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_.round(3),
}).sort_values(by='importance', ascending=False)
print('Importances')
print(importances)
print('\n')

In [None]:
# Display the parameters of the baseline model; the parameter names shown
# here are the keys a grid search over this estimator accepts.
model.get_params()

In [None]:
# Hyperparameter grid for the AdaBoost grid search.
n_estimators = [50, 75, 100]
base_estimator__max_features = list(range(2, 7))    # 2 .. 6
base_estimator__max_depth = list(range(1, 11))      # 1 .. 10

# `base_estimator__*` keys address parameters of the wrapped decision tree.
search_params = dict(
    n_estimators=n_estimators,
    base_estimator__max_features=base_estimator__max_features,
    base_estimator__max_depth=base_estimator__max_depth,
)

# Recreate the 80/20 hold-out split (same seed and size as the baseline cell).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=100, test_size=0.20
)

In [None]:
# Grid search over `search_params`, evaluated on 10 random 80/20 splits of the
# training data. No `scoring` is given, so the estimator's default score is used.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)

model_v2 = GridSearchCV(
    estimator=model,
    param_grid=search_params,
    cv=cv,
    n_jobs=-1,  # use all available cores
).fit(X_train, y_train)

best_params = model_v2.best_params_
print('Best Parameters', best_params)


In [None]:
# Train the hypertuned model.
# NOTE(review): the hyperparameters are written out by hand rather than read
# from `best_params` - presumably copied from an earlier grid-search run; confirm.
model_v3 = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=7, max_features=3, random_state=100),
    n_estimators=100,
    random_state=100,
)
model_v3.fit(X_train, y_train)

# Cross-validate on the full data set with 10 random 80/20 splits.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)
scores = cross_val_score(model_v3, X, y, cv=cv, n_jobs=1)

print('Cross Validation accuracy scores: %s' % scores)
print('Cross Validation accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))

In [None]:
# Cross-validate the tuned model with the custom multiclass metric.
# Note: 5 splits here, not the 10 used for the accuracy cross-validation.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=100)

scores = cross_val_score(
    model_v3,
    X,
    y,
    scoring=make_scorer(cedric_metric_nonbinary, greater_is_better=True),
    cv=cv,
    n_jobs=1,
)

print('Cross Validation custom metric scores: %s' % scores)
print('Cross Validation custom metric: %.3f +/- %.3f' % (scores.mean(), scores.std()))

In [None]:
# Feature importances of the tuned model, rounded to 3 decimals, largest first.
importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model_v3.feature_importances_.round(3),
}).sort_values(by='importance', ascending=False)
print('Importances')
print(importances)
print('\n')

# Per-class precision / recall / F1 on the hold-out split.
predict_val = model_v3.predict(X_val)
print('Classification report')
print(classification_report(y_val, predict_val))
print('\n')

# Confusion matrix normalised over the true labels (each row sums to 1).
matrix = plot_confusion_matrix(
    model_v3, X_val, y_val, normalize='true', cmap=plt.cm.Blues
)
matrix.ax_.set_title('Confusion Matrix')
fig = plt.gcf()
fig.set_size_inches(7, 7)
plt.show()


## Ada Boost 2 - Binary
#### Features = 'capacity', 'vib1', 'vib2', 'amp_uni', 'mic1', 'mic2'
#### Value = 'anomaly_binary'

In [None]:
# Target column for this experiment.
value = 'anomaly_binary'

# Columns removed from `data` before modelling; whatever remains besides
# `value` is used as a feature.
drop_list = [
    # presumably alternative labels / encodings - not used as features here
    'type',
    'capacity_cat',
    'pump_cat',
    'anomaly',
    'anomaly_cat',
    'type_cat',
    # timestamp columns
    'datetime',
    'date',
    'time',
    # per-axis / per-channel columns (the combined columns are kept instead)
    'vib1_x',
    'vib1_y',
    'vib1_z',
    'vib2_x',
    'vib2_y',
    'vib2_z',
    'amp1',
    'amp2',
    'pump',
    'anomaly_binary_cat',
]

In [None]:
# Keep only the feature columns and the target.
df = data.drop(columns=drop_list)

# Split into target vector and feature matrix.
y = df[value]
X = df.drop(columns=value)

# 80/20 hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=100, test_size=0.20
)

# Baseline: AdaBoost over an unconstrained decision tree, all other settings default.
model = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(random_state=100),
    random_state=100,
)
model.fit(X_train, y_train)

# Accuracy on the training part and on the hold-out part.
predict_train = model.predict(X_train)
accuracy_train = accuracy_score(y_train, predict_train)
predict_val = model.predict(X_val)
accuracy_val = accuracy_score(y_val, predict_val)

print("Accuracies")
print('accuracy_score on train dataset : ', accuracy_train)
# The "test dataset" label refers to the hold-out split (X_val / y_val).
print('accuracy_score on test dataset : ', accuracy_val)
print('\n')

# Feature importances, rounded to 3 decimals, largest first.
importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_.round(3),
}).sort_values(by='importance', ascending=False)
print('Importances')
print(importances)
print('\n')

# Custom binary metric on the hold-out predictions.
print('Custom metric')
print(cedric_metric_binary(y_val, predict_val))

In [None]:
# Display the parameters of the baseline binary model; the parameter names
# shown here are the keys a grid search over this estimator accepts.
model.get_params()

In [None]:
# Hyperparameter grid for the AdaBoost grid search.
n_estimators = [50, 75, 100]
base_estimator__max_features = list(range(2, 7))    # 2 .. 6
base_estimator__max_depth = list(range(1, 11))      # 1 .. 10

# `base_estimator__*` keys address parameters of the wrapped decision tree.
search_params = dict(
    n_estimators=n_estimators,
    base_estimator__max_features=base_estimator__max_features,
    base_estimator__max_depth=base_estimator__max_depth,
)

# Recreate the 80/20 hold-out split (same seed and size as the baseline cell).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=100, test_size=0.20
)

In [None]:
#perform model hypertuning

# Define the splitter in this cell instead of relying on whichever `cv` an
# earlier cell left in the kernel (on a top-to-bottom run that would be a
# 5-split object). Settings match the other grid-search cells in this notebook.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)

# No `scoring` is given, so the estimator's default score is used.
model_v2 = GridSearchCV(model, search_params, cv=cv, n_jobs=-1)

model_v2.fit(X_train, y_train)

best_params = model_v2.best_params_

print('Best Parameters', model_v2.best_params_)


In [None]:
# Train the hypertuned model.
# NOTE(review): the hyperparameters are written out by hand rather than read
# from `best_params` - presumably copied from an earlier grid-search run; confirm.
model_v3 = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=5, max_features=5, random_state=100),
    n_estimators=75,
    random_state=100,
)
model_v3.fit(X_train, y_train)

# Cross-validate on the full data set with 10 random 80/20 splits.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)
scores = cross_val_score(model_v3, X, y, cv=cv, n_jobs=1)

print('Cross Validation accuracy scores: %s' % scores)
print('Cross Validation accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))


In [None]:
# Cross-validate the tuned model with the custom binary metric (10 random 80/20 splits).
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)

scores = cross_val_score(
    model_v3,
    X,
    y,
    scoring=make_scorer(cedric_metric_binary, greater_is_better=True),
    cv=cv,
    n_jobs=1,
)

print('Cross Validation custom metric scores: %s' % scores)
print('Cross Validation custom metric: %.3f +/- %.3f' % (scores.mean(), scores.std()))

In [None]:
# Feature importances of the tuned model, rounded to 3 decimals, largest first.
importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model_v3.feature_importances_.round(3),
}).sort_values(by='importance', ascending=False)
print('Importances')
print(importances)
print('\n')

# Per-class precision / recall / F1 on the hold-out split.
predict_val = model_v3.predict(X_val)
print('Classification report')
print(classification_report(y_val, predict_val))
print('\n')

# Confusion matrix normalised over the true labels (each row sums to 1).
matrix = plot_confusion_matrix(
    model_v3, X_val, y_val, normalize='true', cmap=plt.cm.Blues
)
matrix.ax_.set_title('Confusion Matrix')
fig = plt.gcf()
fig.set_size_inches(7, 7)
plt.show()

# Custom binary metric on the hold-out predictions of the tuned model.
print('Custom metric')
print(cedric_metric_binary(y_val, predict_val))



## Ada Boost 3 - Multiclass with original features
#### Features = 'capacity', 'vib1_x', 'vib1_y', 'vib1_z', 'vib2_x', 'vib2_y', 'vib2_z', 'amp1', 'amp2', 'mic1', 'mic2'
#### Value = 'type_cat'

In [None]:
# Target column for this experiment.
value = 'type_cat'

# Columns removed from `data` before modelling. Unlike the first two
# experiments, the combined columns are dropped and the per-axis /
# per-channel columns stay in as features.
drop_list1 = [
    # presumably alternative labels / encodings - not used as features here
    'type',
    'capacity_cat',
    'pump_cat',
    'anomaly',
    'anomaly_cat',
    'anomaly_binary',
    # timestamp columns
    'datetime',
    'date',
    'time',
    # combined columns
    'vib1',
    'vib2',
    'amp_uni',
    'pump',
    'anomaly_binary_cat',
]

In [None]:
# Keep only the feature columns and the target.
df = data.drop(columns=drop_list1)

# Split into target vector and feature matrix.
y = df[value]
X = df.drop(columns=value)

# 80/20 hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=100, test_size=0.20
)

# Baseline: AdaBoost over an unconstrained decision tree, all other settings default.
model = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(random_state=100),
    random_state=100,
)
model.fit(X_train, y_train)

# Accuracy on the training part and on the hold-out part.
predict_train = model.predict(X_train)
accuracy_train = accuracy_score(y_train, predict_train)
predict_val = model.predict(X_val)
accuracy_val = accuracy_score(y_val, predict_val)

print("Accuracies")
print('accuracy_score on train dataset : ', accuracy_train)
# The "test dataset" label refers to the hold-out split (X_val / y_val).
print('accuracy_score on test dataset : ', accuracy_val)
print('\n')

# Feature importances, rounded to 3 decimals, largest first.
importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_.round(3),
}).sort_values(by='importance', ascending=False)
print('Importances')
print(importances)
print('\n')

# Custom multiclass metric on the hold-out predictions.
print('Custom metric')
print(cedric_metric_nonbinary(y_val, predict_val))

In [None]:
# Cross-validate the baseline model on the full data set (10 random 80/20 splits).
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)

# 1) default score of the estimator
scores = cross_val_score(model, X, y, cv=cv, n_jobs=-1)
print('Cross Validation accuracy scores: %s' % scores)
print('Cross Validation accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))

# 2) custom multiclass metric, same splitter
scores = cross_val_score(
    model,
    X,
    y,
    scoring=make_scorer(cedric_metric_nonbinary, greater_is_better=True),
    cv=cv,
    n_jobs=-1,
)
print('Cross Validation custom metric scores: %s' % scores)
print('Cross Validation custom metric: %.3f +/- %.3f' % (scores.mean(), scores.std()))

In [None]:
# Hyperparameter grid for the AdaBoost grid search.
n_estimators = [50, 75, 100]
base_estimator__max_features = list(range(2, 11, 2))   # 2, 4, 6, 8, 10
base_estimator__max_depth = list(range(3, 10, 2))      # 3, 5, 7, 9
criterion = ['gini', 'entropy']

# `base_estimator__*` keys address parameters of the wrapped decision tree.
search_params = dict(
    n_estimators=n_estimators,
    base_estimator__max_features=base_estimator__max_features,
    base_estimator__max_depth=base_estimator__max_depth,
    base_estimator__criterion=criterion,
)

# Recreate the 80/20 hold-out split (same seed and size as the baseline cell).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=100, test_size=0.20
)

In [None]:
# Grid search over `search_params` on 10 random 80/20 splits of the training
# data, ranking candidates by the custom multiclass metric (higher is better).
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)

model_v2 = GridSearchCV(
    estimator=model,
    param_grid=search_params,
    scoring=make_scorer(cedric_metric_nonbinary, greater_is_better=True),
    cv=cv,
    n_jobs=-1,  # use all available cores
).fit(X_train, y_train)

best_params = model_v2.best_params_
print('Best Parameters', best_params)

In [None]:
# Train the hypertuned model.
# NOTE(review): the hyperparameters are written out by hand rather than read
# from `best_params` - presumably copied from an earlier grid-search run; confirm.
model_v3 = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(
        criterion='entropy', max_depth=5, max_features=4, random_state=100
    ),
    n_estimators=50,
    random_state=100,
)
model_v3.fit(X_train, y_train)

# `cv` is not defined in this cell; it must already exist in the kernel.
# 1) default score of the estimator
scores = cross_val_score(model_v3, X, y, cv=cv, n_jobs=-1)
print('Cross Validation accuracy scores: %s' % scores)
print('Cross Validation accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))
print('\n')

# 2) custom multiclass metric, same splitter
scores = cross_val_score(
    model_v3,
    X,
    y,
    scoring=make_scorer(cedric_metric_nonbinary, greater_is_better=True),
    cv=cv,
    n_jobs=-1,
)
print('Cross Validation custom metric scores: %s' % scores)
print('Cross Validation custom metric: %.3f +/- %.3f' % (scores.mean(), scores.std()))

## Ada Boost 4 - Binary with original features
#### Features = 'capacity', 'vib1_x', 'vib1_y', 'vib1_z', 'vib2_x', 'vib2_y', 'vib2_z', 'amp1', 'amp2', 'mic1', 'mic2'
#### Value = 'anomaly_binary'

In [None]:
# Target column for this experiment.
value = 'anomaly_binary'

# Columns removed from `data` before modelling. The combined columns are
# dropped and the per-axis / per-channel columns stay in as features.
drop_list2 = [
    # presumably alternative labels / encodings - not used as features here
    'type',
    'capacity_cat',
    'pump_cat',
    'anomaly',
    'anomaly_cat',
    'type_cat',
    # timestamp columns
    'datetime',
    'date',
    'time',
    # combined columns
    'vib1',
    'vib2',
    'amp_uni',
    'pump',
    'anomaly_binary_cat',
]

In [None]:
# Keep only the feature columns and the target.
df = data.drop(columns=drop_list2)

# Split into target vector and feature matrix.
y = df[value]
X = df.drop(columns=value)

# 80/20 hold-out split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=100, test_size=0.20
)

# Baseline: AdaBoost over an unconstrained decision tree, all other settings default.
model = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(random_state=100),
    random_state=100,
)
model.fit(X_train, y_train)

# Accuracy on the training part and on the hold-out part.
predict_train = model.predict(X_train)
accuracy_train = accuracy_score(y_train, predict_train)
predict_val = model.predict(X_val)
accuracy_val = accuracy_score(y_val, predict_val)

print("Accuracies")
print('accuracy_score on train dataset : ', accuracy_train)
# The "test dataset" label refers to the hold-out split (X_val / y_val).
print('accuracy_score on test dataset : ', accuracy_val)
print('\n')

# Feature importances, rounded to 3 decimals, largest first.
importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_.round(3),
}).sort_values(by='importance', ascending=False)
print('Importances')
print(importances)
print('\n')

# Custom binary metric on the hold-out predictions.
print('Custom metric')
print(cedric_metric_binary(y_val, predict_val))

In [None]:
# Cross-validate the baseline model on the full data set (10 random 80/20 splits).
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)

# 1) default score of the estimator
scores = cross_val_score(model, X, y, cv=cv, n_jobs=-1)
print('Cross Validation accuracy scores: %s' % scores)
print('Cross Validation accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))

# 2) custom binary metric, same splitter
scores = cross_val_score(
    model,
    X,
    y,
    scoring=make_scorer(cedric_metric_binary, greater_is_better=True),
    cv=cv,
    n_jobs=-1,
)
print('Cross Validation custom metric scores: %s' % scores)
print('Cross Validation custom metric: %.3f +/- %.3f' % (scores.mean(), scores.std()))

In [None]:
# Hyperparameter grid for the AdaBoost grid search.
n_estimators = [50, 75, 100]
base_estimator__max_features = list(range(2, 11, 2))   # 2, 4, 6, 8, 10
base_estimator__max_depth = list(range(3, 10, 2))      # 3, 5, 7, 9
criterion = ['gini', 'entropy']

# `base_estimator__*` keys address parameters of the wrapped decision tree.
search_params = dict(
    n_estimators=n_estimators,
    base_estimator__max_features=base_estimator__max_features,
    base_estimator__max_depth=base_estimator__max_depth,
    base_estimator__criterion=criterion,
)

# Recreate the 80/20 hold-out split (same seed and size as the baseline cell).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, random_state=100, test_size=0.20
)

In [None]:
# Grid search over `search_params` on 10 random 80/20 splits of the training
# data, ranking candidates by the custom binary metric (higher is better).
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)

model_v2 = GridSearchCV(
    estimator=model,
    param_grid=search_params,
    scoring=make_scorer(cedric_metric_binary, greater_is_better=True),
    cv=cv,
    n_jobs=-1,  # use all available cores
).fit(X_train, y_train)

best_params = model_v2.best_params_
print('Best Parameters', best_params)

In [None]:
# Train the hypertuned model.
# NOTE(review): the hyperparameters are written out by hand rather than read
# from `best_params` - presumably copied from an earlier grid-search run; confirm.
model_v3 = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(
        criterion='gini', max_depth=3, max_features=2, random_state=100
    ),
    n_estimators=50,
    random_state=100,
)
model_v3.fit(X_train, y_train)

# `cv` is not defined in this cell; it must already exist in the kernel.
# 1) default score of the estimator
scores = cross_val_score(model_v3, X, y, cv=cv, n_jobs=-1)
print('Cross Validation accuracy scores: %s' % scores)
print('Cross Validation accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))
print('\n')

# 2) custom binary metric, same splitter
scores = cross_val_score(
    model_v3,
    X,
    y,
    scoring=make_scorer(cedric_metric_binary, greater_is_better=True),
    cv=cv,
    n_jobs=-1,
)
print('Cross Validation custom metric scores: %s' % scores)
print('Cross Validation custom metric: %.3f +/- %.3f' % (scores.mean(), scores.std()))