In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble  import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score, mean_squared_error, r2_score, accuracy_score, confusion_matrix, classification_report, plot_confusion_matrix, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn import tree
from sklearn.preprocessing import StandardScaler


# Make the project's feature helpers importable; the path is relative to the
# notebook's working directory.
import sys
sys.path.insert(0, '../src/features/')
# Import only the two custom metrics this notebook uses instead of `*`, so the
# origin of each name is explicit and nothing imported above can be shadowed.
from custom_metric import cedric_metric_binary, cedric_metric_nonbinary

# Show every column when a wide DataFrame is displayed
# ("display.max_columns" is the canonical option key).
pd.set_option("display.max_columns", None)

In [None]:
# Load the cleaned dataset from disk.
data = pd.read_csv(filepath_or_buffer='../data/clean_data.csv')

# List the available columns so the drop lists below can be checked against them.
data.columns

## Random Forest 1 - Multiclass
#### Features = 'capacity', 'vib1', 'vib2', 'amp_uni', 'mic1', 'mic2'
#### Value = 'type_cat'

In [None]:
# Columns removed before modelling, so that only the six features named in the
# section heading (plus the target) remain.
# 'Unnamed: 0' is dropped as well, consistent with the binary model's drop list:
# otherwise it would silently be used as a seventh feature.
# NOTE(review): 'Unnamed: 0' is presumably the row index written by to_csv - confirm.
drop_list = ['Unnamed: 0', 'type', 'capacity_cat', 'pump_cat',
                'anomaly', 'anomaly_cat','anomaly_binary', 'datetime', 'date', 'time',
                'vib1_x', 'vib1_y', 'vib1_z', 'vib2_x', 'vib2_y', 'vib2_z', 'amp1', 'amp2', 'pump', 'anomaly_binary_cat']

# Name of the target column (multiclass label).
value = 'type_cat'

In [None]:
# Keep only the target and the feature columns.
df = data.drop(columns=drop_list)

# Separate the target from the features.
y = df[value]
X = df.drop(columns=[value])

# Hold out 20% of the rows for validation (fixed seed for reproducibility).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=100)

# Fit a baseline random forest; only the seed is set explicitly.
model = RandomForestClassifier(random_state=100)
model.fit(X_train, y_train)

# Accuracy on the training split and on the held-out split.
# (The second label says "test", but the score is computed on X_val / y_val.)
predict_train = model.predict(X_train)
predict_val = model.predict(X_val)
accuracy_train = accuracy_score(y_train, predict_train)
accuracy_val = accuracy_score(y_val, predict_val)
print("Accuracies")
print('accuracy_score on train dataset : ', accuracy_train)
print('accuracy_score on test dataset : ', accuracy_val)
print('\n')

# Feature importances of the baseline forest, rounded and ranked.
importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_.round(3),
}).sort_values('importance', ascending=False)
print('Importances')
print(importances)
print('\n')


In [None]:
# Display the hyperparameters of the fitted baseline forest as a reference
# point before tuning.
model.get_params()

In [None]:
# Hyperparameter tuning: candidate values for each tuned parameter.
n_estimators = [50, 75, 100]                          # number of trees in the forest
max_features = [2, 3, 4, 5, 6]                        # features considered at every split
min_samples_split = [2, 5, 10, 20, 40, 60, 80, 100]   # minimum samples required to split a node
bootstrap = [True, False]                             # whether each tree is trained on a bootstrap sample

# Full grid: every combination of the values above is evaluated.
search_params = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    min_samples_split=min_samples_split,
    bootstrap=bootstrap,
)

# Ten random 80/20 splits of the training data, seeded for reproducibility.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)

# Exhaustive search on the training split using the estimator's default score.
model_v2 = GridSearchCV(model, search_params, cv=cv, n_jobs=-1)
model_v2.fit(X_train, y_train)

best_params = model_v2.best_params_

print('Best Parameters: ', best_params)

print('Best score: ', model_v2.best_score_)

In [None]:
# Train the tuned model with the hyperparameters selected by the grid search.
# Fix: `bootstrap` must be the boolean False. The string 'False' is truthy, so
# scikit-learn versions without parameter validation silently keep bootstrapping
# enabled, and versions with validation reject the string outright.
model_v3 = RandomForestClassifier(random_state=100, bootstrap=False, min_samples_split=5, max_features=2, n_estimators=75)
model_v3.fit(X_train, y_train)

# Cross-validate the tuned configuration (default score: accuracy) on ten
# random 80/20 splits.
# NOTE(review): this uses the full X, y, which includes the rows the grid
# search was tuned on - confirm this is intended.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)
scores = cross_val_score(model_v3, X, y, cv=cv, n_jobs=-1)

print('Cross Validation accuracy scores: %s' % scores)

print('Cross Validation accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

In [None]:
# Cross-validate the tuned model again, this time scoring with the project's
# custom multiclass metric instead of accuracy.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)
scores = cross_val_score(model_v3, X, y, cv=cv, n_jobs=-1, scoring=make_scorer(cedric_metric_nonbinary, greater_is_better=True))

# Fix: these are custom-metric values, so label them as such (the original
# printed them as "accuracy"), matching the labels used later in the notebook.
print('Cross Validation custom metric scores: %s' % scores)

print('Cross Validation custom metric: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

In [None]:
# Rank the features by their importance in the tuned forest.
importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model_v3.feature_importances_.round(3),
}).sort_values('importance', ascending=False)
print('Importances')
print(importances)
print('\n')

# Classification report on the validation split.
print('Classification report')
predict_val = model_v3.predict(X_val)
print(classification_report(y_val, predict_val))
print('\n')

# Confusion matrix on the validation split, normalised over the true labels.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; ConfusionMatrixDisplay.from_estimator is its replacement.
matrix = plot_confusion_matrix(model_v3, X_val, y_val, cmap=plt.cm.Blues, normalize='true')
matrix.ax_.set_title('Confusion Matrix')
fig = plt.gcf()
fig.set_size_inches(7, 7)
plt.show()


In [None]:
# The custom metric is low for this model, so tune again - this time selecting
# hyperparameters by the custom metric itself.
n_estimators = [10, 15, 20, 25, 30, 35, 50]          # number of trees in the forest
max_features = [2, 3, 4, 5, 6]                       # features considered at every split
min_samples_split = [2, 5, 10, 15, 20, 25, 30, 50]   # minimum samples required to split a node
bootstrap = [True, False]                            # whether each tree is trained on a bootstrap sample

# Full grid: every combination of the values above is evaluated.
search_params = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    min_samples_split=min_samples_split,
    bootstrap=bootstrap,
)

# Ten random 80/20 splits of the training data, seeded for reproducibility.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)

# Exhaustive search on the training split, scored with the custom multiclass metric.
model_v2 = GridSearchCV(model, search_params, cv=cv, n_jobs=-1, scoring=make_scorer(cedric_metric_nonbinary, greater_is_better=True))
model_v2.fit(X_train, y_train)

best_params = model_v2.best_params_

print('Best Parameters: ', best_params)


In [None]:
# Train the model tuned on the custom metric.
# Fix: `bootstrap` must be the boolean False. The string 'False' is truthy, so
# scikit-learn versions without parameter validation silently keep bootstrapping
# enabled, and versions with validation reject the string outright.
model_v3 = RandomForestClassifier(random_state=100, bootstrap=False, min_samples_split=5, max_features=2, n_estimators=35)
model_v3.fit(X_train, y_train)

# Cross-validate the tuned configuration (default score: accuracy) on ten
# random 80/20 splits.
# NOTE(review): this uses the full X, y, which includes the rows the grid
# search was tuned on - confirm this is intended.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)
scores = cross_val_score(model_v3, X, y, cv=cv, n_jobs=-1)

print('Cross Validation accuracy scores: %s' % scores)

print('Cross Validation accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

In [None]:
# Cross-validate the re-tuned model, scoring with the project's custom
# multiclass metric instead of accuracy.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)
scores = cross_val_score(model_v3, X, y, cv=cv, n_jobs=-1, scoring=make_scorer(cedric_metric_nonbinary, greater_is_better=True))

# Fix: these are custom-metric values, so label them as such (the original
# printed them as "accuracy"), matching the labels used later in the notebook.
print('Cross Validation custom metric scores: %s' % scores)

print('Cross Validation custom metric: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

## Random Forest 2 - Binary
#### Features = 'capacity', 'vib1', 'vib2', 'amp_uni', 'mic1', 'mic2'
#### Value = 'anomaly_binary'

In [None]:
# Columns removed before modelling the binary target: the CSV index column,
# every other label/category column, the timestamp columns and the per-axis
# sensor readings.
drop_list = [
    'Unnamed: 0', 'type', 'capacity_cat', 'pump_cat',
    'anomaly', 'anomaly_cat', 'type_cat', 'datetime', 'date', 'time',
    'vib1_x', 'vib1_y', 'vib1_z', 'vib2_x', 'vib2_y', 'vib2_z',
    'amp1', 'amp2', 'pump', 'anomaly_binary_cat',
]

# Name of the target column (binary anomaly flag).
value = 'anomaly_binary'

In [None]:
# Keep only the target and the feature columns.
df = data.drop(columns=drop_list)

# Separate the target from the features.
y = df[value]
X = df.drop(columns=[value])

# Hold out 20% of the rows for validation (fixed seed for reproducibility).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=100)

# Fit a baseline random forest; only the seed is set explicitly.
model = RandomForestClassifier(random_state=100)
model.fit(X_train, y_train)

# Accuracy on the training split and on the held-out split.
# (The second label says "test", but the score is computed on X_val / y_val.)
predict_train = model.predict(X_train)
predict_val = model.predict(X_val)
accuracy_train = accuracy_score(y_train, predict_train)
accuracy_val = accuracy_score(y_val, predict_val)
print("Accuracies")
print('accuracy_score on train dataset : ', accuracy_train)
print('accuracy_score on test dataset : ', accuracy_val)
print('\n')

# Feature importances of the baseline forest, rounded and ranked.
importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_.round(3),
}).sort_values('importance', ascending=False)
print('Importances')
print(importances)
print('\n')


In [None]:
# Display the hyperparameters of the fitted baseline forest as a reference
# point before tuning.
model.get_params()

In [None]:
# Hyperparameter tuning: candidate values for each tuned parameter.
n_estimators = [50, 75, 100]             # number of trees in the forest
max_features = [2, 3, 4, 5, 6]           # features considered at every split
min_samples_split = [2, 5, 10, 20, 50]   # minimum samples required to split a node
bootstrap = [True, False]                # whether each tree is trained on a bootstrap sample

# Full grid: every combination of the values above is evaluated.
search_params = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    min_samples_split=min_samples_split,
    bootstrap=bootstrap,
)

# Ten random 80/20 splits of the training data, seeded for reproducibility.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)

# Exhaustive search on the training split using the estimator's default score.
model_v2 = GridSearchCV(model, search_params, cv=cv, n_jobs=-1)
model_v2.fit(X_train, y_train)

best_params = model_v2.best_params_

print('Best Parameters: ', best_params)

print('Best score: ', model_v2.best_score_)

In [None]:
# Train the tuned binary model with the hyperparameters selected by the grid search.
# Fix: `bootstrap` must be the boolean False. The string 'False' is truthy, so
# scikit-learn versions without parameter validation silently keep bootstrapping
# enabled, and versions with validation reject the string outright.
model_v3 = RandomForestClassifier(random_state=100, bootstrap=False, min_samples_split=5, max_features=2, n_estimators=50)
model_v3.fit(X_train, y_train)

# Cross-validate the tuned configuration (default score: accuracy) on ten
# random 80/20 splits.
# NOTE(review): this uses the full X, y, which includes the rows the grid
# search was tuned on - confirm this is intended.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)
scores = cross_val_score(model_v3, X, y, cv=cv, n_jobs=-1)

print('Cross Validation accuracy scores: %s' % scores)

print('Cross Validation accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

In [None]:
# Cross-validate the tuned binary model, scoring with the project's custom
# binary metric instead of accuracy.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)
scores = cross_val_score(model_v3, X, y, cv=cv, n_jobs=-1, scoring=make_scorer(cedric_metric_binary, greater_is_better=True))

# Fix: these are custom-metric values, so label them as such (the original
# printed them as "accuracy"), matching the labels used later in the notebook.
print('Cross Validation custom metric scores: %s' % scores)

print('Cross Validation custom metric: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

In [None]:
# Rank the features by their importance in the tuned forest.
importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model_v3.feature_importances_.round(3),
}).sort_values('importance', ascending=False)
print('Importances')
print(importances)
print('\n')

# Classification report on the validation split.
print('Classification report')
predict_val = model_v3.predict(X_val)
print(classification_report(y_val, predict_val))
print('\n')

# Confusion matrix on the validation split, normalised over the true labels.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; ConfusionMatrixDisplay.from_estimator is its replacement.
matrix = plot_confusion_matrix(model_v3, X_val, y_val, cmap=plt.cm.Blues, normalize='true')
matrix.ax_.set_title('Confusion Matrix')
fig = plt.gcf()
fig.set_size_inches(7, 7)
plt.show()

In [None]:
# The custom metric is small for this model, so tune again - this time
# selecting hyperparameters by the custom metric itself.
n_estimators = [10, 15, 20, 25, 30, 35, 50]          # number of trees in the forest
max_features = [2, 3, 4, 5, 6]                       # features considered at every split
min_samples_split = [2, 5, 10, 15, 20, 25, 30, 50]   # minimum samples required to split a node
bootstrap = [True, False]                            # whether each tree is trained on a bootstrap sample

# Full grid: every combination of the values above is evaluated.
search_params = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    min_samples_split=min_samples_split,
    bootstrap=bootstrap,
)

# Ten random 80/20 splits of the training data, seeded for reproducibility.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)

# Exhaustive search on the training split, scored with the custom binary metric.
model_v2 = GridSearchCV(model, search_params, cv=cv, n_jobs=-1, scoring=make_scorer(cedric_metric_binary, greater_is_better=True))
model_v2.fit(X_train, y_train)

best_params = model_v2.best_params_

print('Best Parameters: ', best_params)

print('Best score: ', model_v2.best_score_)

In [None]:
# Train the binary model tuned on the custom metric.
# Fix: `bootstrap` must be the boolean False. The string 'False' is truthy, so
# scikit-learn versions without parameter validation silently keep bootstrapping
# enabled, and versions with validation reject the string outright.
model_v3 = RandomForestClassifier(random_state=100, bootstrap=False, min_samples_split=20, max_features=2, n_estimators=25)
model_v3.fit(X_train, y_train)

# Cross-validate the tuned configuration (default score: accuracy) on ten
# random 80/20 splits.
# NOTE(review): this uses the full X, y, which includes the rows the grid
# search was tuned on - confirm this is intended.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)
scores = cross_val_score(model_v3, X, y, cv=cv, n_jobs=-1)

print('Cross Validation accuracy scores: %s' % scores)

print('Cross Validation accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

In [None]:
# Cross-validate the re-tuned binary model, scoring with the project's custom
# binary metric instead of accuracy.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)
scores = cross_val_score(model_v3, X, y, cv=cv, n_jobs=-1, scoring=make_scorer(cedric_metric_binary, greater_is_better=True))

# Fix: these are custom-metric values, so label them as such (the original
# printed them as "accuracy"), matching the labels used later in the notebook.
print('Cross Validation custom metric scores: %s' % scores)

print('Cross Validation custom metric: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

## Random Forest 3 - Binary with original features
#### Features = 'capacity', 'vib1_x', 'vib1_y', 'vib1_z', 'vib2_x', 'vib2_y', 'vib2_z', 'amp1', 'amp2', 'mic1', 'mic2'
#### Value = 'anomaly_binary'

In [None]:
# Columns removed before modelling the binary target on the original
# (per-axis) sensor features; the aggregated 'vib1', 'vib2' and 'amp_uni'
# columns are dropped instead of the raw ones.
# 'Unnamed: 0' is dropped as well, consistent with the drop list of the binary
# model on aggregated features: otherwise it would silently be used as a feature.
# NOTE(review): 'Unnamed: 0' is presumably the row index written by to_csv - confirm.
drop_list6 = ['Unnamed: 0', 'type', 'type_cat', 'capacity_cat', 'pump_cat',
                'anomaly', 'anomaly_cat', 'datetime', 'date', 'time',
                'vib1','vib2', 'amp_uni', 'pump', 'anomaly_binary_cat']

# Name of the target column (binary anomaly flag).
value = 'anomaly_binary'

In [None]:
# Keep only the target and the feature columns.
df = data.drop(columns=drop_list6)

# Separate the target from the features.
y = df[value]
X = df.drop(columns=[value])

# Hold out 20% of the rows for validation (fixed seed for reproducibility).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=100)

# Fit a baseline random forest; only the seed is set explicitly.
model = RandomForestClassifier(random_state=100)
model.fit(X_train, y_train)

# Accuracy on the training split and on the held-out split.
# (The second label says "test", but the score is computed on X_val / y_val.)
predict_train = model.predict(X_train)
predict_val = model.predict(X_val)
accuracy_train = accuracy_score(y_train, predict_train)
accuracy_val = accuracy_score(y_val, predict_val)
print("Accuracies")
print('accuracy_score on train dataset : ', accuracy_train)
print('accuracy_score on test dataset : ', accuracy_val)
print('\n')

# Feature importances of the baseline forest, rounded and ranked.
importances = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_.round(3),
}).sort_values('importance', ascending=False)
print('Importances')
print(importances)
print('\n')


In [None]:
# Cross-validate the baseline forest on ten random 80/20 splits, first with
# the default score (accuracy) ...
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)
scores = cross_val_score(model, X, y, cv=cv, n_jobs=-1)

print('Cross Validation accuracy scores: %s' % scores)

print('Cross Validation accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))

# ... then with the project's custom binary metric on the same splits.
scores = cross_val_score(model, X, y, cv=cv, n_jobs=-1, scoring=make_scorer(cedric_metric_binary, greater_is_better=True))

print('Cross Validation custom metric scores: %s' % scores)

print('Cross Validation custom metric: %.3f +/- %.3f' % (scores.mean(), scores.std()))

In [None]:
# Hyperparameter tuning for the binary model on the original features.

# Number of trees in the forest.
# Fix: the original list contained 30 twice, which made the grid search fit
# and score every n_estimators=30 candidate a second time for no benefit.
# NOTE(review): the second 30 may have been meant to be 40 - confirm.
n_estimators = [10, 20, 30, 50]

# Number of features to consider at every split
max_features = [2, 3, 4, 5, 6]

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 20, 50]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Split quality measure
criterion = ['gini', 'entropy']

# Candidate class weights; only class 1 is given an explicit weight
weights = [{1: 10}, {1: 100}, {1: 1000}]

# Maximum depth of each tree
max_depth = [5, 10]

# Full grid: every combination of the values above is evaluated.
search_params = {'n_estimators': n_estimators,
                 'max_features': max_features,
                 'min_samples_split': min_samples_split,
                 'bootstrap': bootstrap,
                 'max_depth': max_depth,
                 'criterion': criterion,
                 'class_weight': weights}

# Ten random 80/20 splits of the training data, seeded for reproducibility.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)

# Exhaustive search on the training split, scored with the custom binary metric.
model_v2 = GridSearchCV(model, search_params, cv=cv, n_jobs=-1, scoring=make_scorer(cedric_metric_binary, greater_is_better=True))

model_v2.fit(X_train, y_train)

best_params = model_v2.best_params_

print('Best Parameters: ', model_v2.best_params_)

print('Best score: ', model_v2.best_score_)

In [None]:
# Train the tuned binary model on the original features.
# Fix: `bootstrap` must be the boolean False. The string 'False' is truthy, so
# scikit-learn versions without parameter validation silently keep bootstrapping
# enabled, and versions with validation reject the string outright.
model_v3 = RandomForestClassifier(random_state=100, bootstrap=False, class_weight={1:100}, min_samples_split=20, max_features=4, n_estimators=30, criterion='entropy', max_depth=10)
model_v3.fit(X_train, y_train)

# Cross-validate the tuned configuration on ten random 80/20 splits, first
# with the default score (accuracy).
# NOTE(review): this uses the full X, y, which includes the rows the grid
# search was tuned on - confirm this is intended.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)
scores = cross_val_score(model_v3, X, y, cv=cv, n_jobs=-1)

print('Cross Validation accuracy scores: %s' % scores)
print('Cross Validation accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

# Same splits, scored with the project's custom binary metric.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)
scores = cross_val_score(model_v3, X, y, cv=cv, n_jobs=-1, scoring=make_scorer(cedric_metric_binary, greater_is_better=True))

print('Cross Validation custom metric scores: %s' % scores)
print('Cross Validation custom metric: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

## Random Forest 4 - Multiclass with original features
#### Features = 'capacity', 'vib1_x', 'vib1_y', 'vib1_z', 'vib2_x', 'vib2_y', 'vib2_z', 'amp1', 'amp2', 'mic1', 'mic2'
#### Value = 'type_cat'

In [None]:
# Columns removed before modelling the multiclass target on the original
# (per-axis) sensor features; the aggregated 'vib1', 'vib2' and 'amp_uni'
# columns are dropped instead of the raw ones.
# 'Unnamed: 0' is dropped as well, consistent with the drop list of the binary
# model on aggregated features: otherwise it would silently be used as a feature.
# NOTE(review): 'Unnamed: 0' is presumably the row index written by to_csv - confirm.
drop_list4 = ['Unnamed: 0', 'type', 'anomaly_binary', 'capacity_cat', 'pump_cat',
                'anomaly', 'anomaly_cat', 'datetime', 'date', 'time',
                'vib1','vib2', 'amp_uni', 'pump', 'anomaly_binary_cat']

# Name of the target column (multiclass label).
value = 'type_cat'

In [None]:
# Keep only the target and the feature columns.
df = data.drop(columns=drop_list4)

# Separate the target from the features.
y = df[value]
X = df.drop(columns=[value])

# Standardise the features; from here on X is the array returned by the scaler.
# NOTE(review): the scaler is fitted on all rows before the split, so the
# validation rows influence the scaling statistics - consider fitting on the
# training split only.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Hold out 20% of the rows for validation (fixed seed for reproducibility).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, random_state=100)

# Fit a baseline random forest; only the seed is set explicitly.
model = RandomForestClassifier(random_state=100)
model.fit(X_train, y_train)

# Accuracy on the training split and on the held-out split.
# (The second label says "test", but the score is computed on X_val / y_val.)
predict_train = model.predict(X_train)
predict_val = model.predict(X_val)
accuracy_train = accuracy_score(y_train, predict_train)
accuracy_val = accuracy_score(y_val, predict_val)
print("Accuracies")
print('accuracy_score on train dataset : ', accuracy_train)
print('accuracy_score on test dataset : ', accuracy_val)
print('\n')


In [None]:
# Cross-validate the baseline forest on ten random 80/20 splits, first with
# the default score (accuracy) ...
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)
scores = cross_val_score(model, X, y, cv=cv, n_jobs=-1)

print('Cross Validation accuracy scores: %s' % scores)

print('Cross Validation accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))

# ... then with the project's custom multiclass metric on the same splits.
scores = cross_val_score(model, X, y, cv=cv, n_jobs=-1, scoring=make_scorer(cedric_metric_nonbinary, greater_is_better=True))

print('Cross Validation custom metric scores: %s' % scores)

print('Cross Validation custom metric: %.3f +/- %.3f' % (scores.mean(), scores.std()))

In [None]:
# Hyperparameter tuning for the multiclass model on the original features.
n_estimators = [20, 30, 50, 75]       # number of trees in the forest
max_features = [2, 4, 6]              # features considered at every split
min_samples_split = [5, 10, 20, 50]   # minimum samples required to split a node
bootstrap = [True, False]             # whether each tree is trained on a bootstrap sample
criterion = ['gini', 'entropy']       # split quality measure
max_depth = [5, 10]                   # maximum depth of each tree

# Candidate class weights: classes 1-3 share one weight per candidate;
# no other class is given an explicit weight.
weights = [
    {1: 1000, 2: 1000, 3: 1000},
    {1: 3000, 2: 3000, 3: 3000},
    {1: 5000, 2: 5000, 3: 5000},
    {1: 7000, 2: 7000, 3: 7000},
    {1: 10000, 2: 10000, 3: 10000},
]

# Full grid: every combination of the values above is evaluated.
search_params = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    min_samples_split=min_samples_split,
    bootstrap=bootstrap,
    max_depth=max_depth,
    criterion=criterion,
    class_weight=weights,
)

# Ten random 80/20 splits of the training data, seeded for reproducibility.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)

# Exhaustive search on the training split, scored with the custom multiclass metric.
model_v2 = GridSearchCV(model, search_params, cv=cv, n_jobs=-1, scoring=make_scorer(cedric_metric_nonbinary, greater_is_better=True))
model_v2.fit(X_train, y_train)

best_params = model_v2.best_params_

print('Best Parameters: ', best_params)

print('Best score: ', model_v2.best_score_)

In [None]:
# Train the tuned multiclass model on the original features.
# Fix: `bootstrap` must be the boolean False. The string 'False' is truthy, so
# scikit-learn versions without parameter validation silently keep bootstrapping
# enabled, and versions with validation reject the string outright.
model_v3 = RandomForestClassifier(random_state=100, bootstrap=False, class_weight={1:7000, 2:7000, 3:7000}, min_samples_split=10, max_features=2, n_estimators=50, criterion='entropy', max_depth=10)
model_v3.fit(X_train, y_train)

# Cross-validate the tuned configuration on ten random 80/20 splits, first
# with the default score (accuracy).
# NOTE(review): this uses the full X, y, which includes the rows the grid
# search was tuned on - confirm this is intended.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)
scores = cross_val_score(model_v3, X, y, cv=cv, n_jobs=-1)

print('Cross Validation accuracy scores: %s' % scores)
print('Cross Validation accuracy: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))

# Same splits, scored with the project's custom multiclass metric.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)
scores = cross_val_score(model_v3, X, y, cv=cv, n_jobs=-1, scoring=make_scorer(cedric_metric_nonbinary, greater_is_better=True))

print('Cross Validation custom metric scores: %s' % scores)
print('Cross Validation custom metric: %.3f +/- %.3f' % (np.mean(scores), np.std(scores)))