In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble  import AdaBoostClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score, mean_squared_error, r2_score, accuracy_score, confusion_matrix, classification_report, plot_confusion_matrix, make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.preprocessing import StandardScaler 


import sys
sys.path.insert(0, '../src/features/')
from custom_metric import *


# Never truncate columns when a DataFrame is displayed.
# NOTE(review): the canonical option key is "display.max_columns"; this dotted
# spelling presumably resolves only because pandas matches option names as
# patterns — confirm before relying on it with other pandas versions.
pd.set_option("display.max.columns", None)

In [None]:
# Noise levels under study; each one selects the file ../data/noisy_data_<level>.csv.
levels = [1, 2, 3]

## Ada Boost 1
#### Features = 'capacity', 'vib1', 'vib2', 'amp_uni', 'mic1', 'mic2'
#### Value = 'type_cat'

In [None]:
# Columns removed from every noisy data set before modelling. Whatever is left,
# apart from the target column named by `value`, is used as a feature.
# (The grouping below is by column name only.)
drop_list = [
    # index, timestamps and pump / capacity encodings
    'Unnamed: 0', 'datetime', 'date', 'time', 'pump', 'pump_cat', 'capacity_cat',
    # label columns other than the target
    'anomaly', 'anomaly_cat', 'anomaly_binary', 'anomaly_binary_cat', 'type',
    # sensor columns without a noise suffix
    'vib1', 'vib1_x', 'vib1_y', 'vib1_z',
    'vib2', 'vib2_x', 'vib2_y', 'vib2_z',
    'amp1', 'amp2', 'amp_uni',
    'mic1', 'mic2',
    # per-axis vibration noise columns
    'vib1_x_noise', 'vib1_x_with_noise',
    'vib1_y_noise', 'vib1_y_with_noise',
    'vib1_z_noise', 'vib1_z_with_noise',
    'vib2_x_noise', 'vib2_x_with_noise',
    'vib2_y_noise', 'vib2_y_with_noise',
    'vib2_z_noise', 'vib2_z_with_noise',
    # per-channel amplitude and microphone noise columns
    'amp1_noise', 'amp1_with_noise',
    'amp2_noise', 'amp2_with_noise',
    'mic1_noise', 'mic2_noise',
]

# Target column for the "Ada Boost 1" section.
value = 'type_cat'

### First, let's train the model that was previously hypertuned on clean data and see how the metrics change as the noise level changes.

In [None]:
# Baseline for the 'type_cat' target: score one fixed AdaBoost configuration on
# every noise level, using 10 shuffled 80/20 splits per level.
accuracies = {}      # noise level -> array with one accuracy score per split
custom_metrics = {}  # noise level -> array with one custom-metric score per split

# The splitter and the scorer do not depend on the noise level, so they are
# built once instead of once per loop iteration.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)
custom_scorer = make_scorer(cedric_metric_nonbinary, greater_is_better=True)

for level in levels:

    #read in data and drop the columns we don't need
    data = pd.read_csv(f'../data/noisy_data_{level}.csv')
    df = data.drop(drop_list, axis=1)

    #define features and value
    X = df.drop(value, axis=1)
    y = df[value]

    # cross_val_score fits its own clones of the estimator on every split, so the
    # former train/validation split and up-front fit (whose results were never
    # read) have been removed.
    model = AdaBoostClassifier(
        random_state=100,
        base_estimator=DecisionTreeClassifier(random_state=100, max_depth=7, max_features=3),
        n_estimators=75,
    )

    #calculate accuracy (the classifier's default score) and custom metric
    accuracies[level] = cross_val_score(model, X, y, cv=cv, n_jobs=1)
    custom_metrics[level] = cross_val_score(model, X, y, cv=cv, n_jobs=1, scoring=custom_scorer)

print(accuracies)
print(custom_metrics)
 

In [None]:
# Per noise level: mean CV accuracy first, then mean CV custom metric.
for level in levels:
    for scores_by_level in (accuracies, custom_metrics):
        print(scores_by_level[level].mean())

In [None]:
# Mean CV accuracy per noise level, with the min-max range of the splits shaded.
# x and y are both derived from the keys of `accuracies`, so they cannot get out
# of step with each other.
noise_levels = list(accuracies.keys())
mean_accuracies = [accuracies[n].mean() for n in noise_levels]
# show range of scores using min and max values
lower_lim = [accuracies[n].min() for n in noise_levels]
upper_lim = [accuracies[n].max() for n in noise_levels]

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(noise_levels, mean_accuracies)
ax.fill_between(noise_levels, lower_lim, upper_lim, alpha=0.3, color='gray')

ax.set_title(f'AdaBoost (for {value}) accuracies for different noise levels', fontsize=16)
ax.set_xlabel('noise level', fontsize=14)
ax.set_ylabel('CV accuracy score (mean score / range)', fontsize=14)
ax.tick_params(axis='both', labelsize=14)

# The file name carries the target and the metric. Previously every plot in the
# notebook was written to 'adaboost_noise.jpeg', so each figure overwrote the
# one saved before it.
fig.savefig(f'adaboost_noise_{value}_accuracy.jpeg')

In [None]:
# Mean CV custom metric per noise level, with the min-max range of the splits shaded.
# x and y are both derived from the keys of `custom_metrics`, so they cannot get
# out of step with each other.
noise_levels = list(custom_metrics.keys())
mean_custom_metrics = [custom_metrics[n].mean() for n in noise_levels]
# show range of scores using min and max values
lower_lim = [custom_metrics[n].min() for n in noise_levels]
upper_lim = [custom_metrics[n].max() for n in noise_levels]

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(noise_levels, mean_custom_metrics)
ax.fill_between(noise_levels, lower_lim, upper_lim, alpha=0.3, color='gray')

ax.set_title(f'AdaBoost (for {value}) custom metrics for different noise levels', fontsize=16)
ax.set_xlabel('noise level', fontsize=14)
ax.set_ylabel('CV custom metric score (mean score / range)', fontsize=14)
ax.tick_params(axis='both', labelsize=14)

# The file name carries the target and the metric. Previously every plot in the
# notebook was written to 'adaboost_noise.jpeg', so each figure overwrote the
# one saved before it.
fig.savefig(f'adaboost_noise_{value}_custom_metric.jpeg')

### Now, let's try hypertuning the models for each noise level separately.

In [None]:
#Let's prepare the dataframes for all noise levels
# Later cells address the prepared objects by name (X_1_train, y_2, ...), so each
# level's objects are still published as notebook globals, but in a single step
# instead of through repeated globals()[...] lookups.

for level in levels:

    #read in data and drop the columns we don't need
    data_level = pd.read_csv(f'../data/noisy_data_{level}.csv')
    df_level = data_level.drop(drop_list, axis=1)

    #define value
    y_level = df_level[value]

    #define features and scale them
    # NOTE(review): the scaler is fitted on the full data set before the split, so
    # validation rows influence the scaling statistics — confirm this is acceptable.
    scaler = StandardScaler()
    X_level = scaler.fit_transform(df_level.drop(value, axis=1))

    #create train and test datasets
    X_level_train, X_level_val, y_level_train, y_level_val = train_test_split(
        X_level, y_level, test_size=0.20, random_state=100)

    globals().update({
        f'data_{level}': data_level,
        f'df_{level}': df_level,
        f'X_{level}': X_level,
        f'y_{level}': y_level,
        f'X_{level}_train': X_level_train,
        f'X_{level}_val': X_level_val,
        f'y_{level}_train': y_level_train,
        f'y_{level}_val': y_level_val,
    })


# Define parameters for hypertuning

n_estimators = [50, 75, 100]

base_estimator__max_features = [2,4,6]

base_estimator__max_depth = [1,3,5,7,9,11]

# NOTE(review): each option gives the same weight to classes 1, 2 and 3, so if
# those are the only labels the options differ only by a constant factor —
# confirm they are meant to be distinct candidates.
weights = [{1: 100, 2: 100, 3: 100},
           {1: 300, 2: 300, 3: 300},
           {1: 1000, 2: 1000, 3: 1000},
           {1: 3000, 2: 3000, 3: 3000}]

l_rate = [1, 2, 3]

min_s_leaf = [1, 5, 10, 15]

# An integer min_samples_split must be at least 2 in scikit-learn, so the former
# candidate 1 was not a usable setting; 2 is the smallest valid value.
min_s_split = [2, 5, 10, 15]

criterion = ['gini', 'entropy']

search_params = {'n_estimators': n_estimators,
                 'base_estimator__max_features': base_estimator__max_features,
                 'base_estimator__max_depth': base_estimator__max_depth,
                 'base_estimator__class_weight': weights,
                 'learning_rate': l_rate,
                 'base_estimator__min_samples_leaf': min_s_leaf,
                 'base_estimator__min_samples_split': min_s_split,
                 'base_estimator__criterion': criterion}

# Same splitter for the grid search and for the later cross-validation cells.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)


### Noise level 1

In [None]:
#train the untuned reference model for noise level 1
model_1 = AdaBoostClassifier(random_state = 100, base_estimator = DecisionTreeClassifier(random_state=100))
model_1.fit(X_1_train, y_1_train)


#perform hypertuning, ranking the candidates by the non-binary custom metric
model_1_v2 = GridSearchCV(model_1, search_params, cv=cv, n_jobs=-1, scoring=make_scorer(cedric_metric_nonbinary, greater_is_better=True))
model_1_v2.fit(X_1_train, y_1_train)

# This line used to read `model_v2`, a name that is never defined, so the cell
# raised NameError only after the whole grid search had finished.
best_params = model_1_v2.best_params_

print('Best Parameters', best_params)


In [None]:
# Read the winning parameter combination from the finished level-1 grid search.
best_params = model_1_v2.best_params_
print('Best Parameters', best_params)

In [None]:
#train and cross-validate the tuned model for noise level 1

# Alternative configuration, kept verbatim for reference and not used.
# NOTE(review): presumably the grid-search result — confirm.
#model_1_v3 = AdaBoostClassifier(random_state = 100, base_estimator = DecisionTreeClassifier(random_state=100, class_weight={1: 1000, 2: 1000, 3: 1000}, criterion='gini', max_depth=1, max_features=6, min_samples_leaf=15, min_samples_split=5), n_estimators=75, learning_rate=2)
model_1_v3 = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(
        max_depth=7,
        max_features=3,
        class_weight={1: 1000, 2: 1000, 3: 1000},
        random_state=100,
    ),
    n_estimators=75,
    random_state=100,
)
model_1_v3.fit(X_1_train, y_1_train)

# Accuracy is the classifier's default score; cross_val_score is run on the
# full level-1 data with the shared splitter `cv`.
scores = cross_val_score(model_1_v3, X_1, y_1, cv=cv, n_jobs=-1)
print('Cross Validation accuracy scores: %s' % scores)
print('Cross Validation accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))
print('\n')

# Same evaluation, scored with the non-binary custom metric.
scores = cross_val_score(
    model_1_v3, X_1, y_1,
    scoring=make_scorer(cedric_metric_nonbinary, greater_is_better=True),
    cv=cv, n_jobs=-1,
)
print('Cross Validation custom metric scores: %s' % scores)
print('Cross Validation custom metric: %.3f +/- %.3f' % (scores.mean(), scores.std()))

### Noise level 2

In [None]:
#train and cross-validate the tuned model for noise level 2

# Alternative configuration, kept verbatim for reference and not used.
# NOTE(review): presumably the grid-search result — confirm.
#model_2_v3 = AdaBoostClassifier(random_state = 100, base_estimator = DecisionTreeClassifier(random_state=100, class_weight={1: 1000, 2: 1000, 3: 1000}, criterion='gini', max_depth=1, max_features=6, min_samples_leaf=15, min_samples_split=5), n_estimators=75, learning_rate=2)
model_2_v3 = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(
        max_depth=7,
        max_features=3,
        class_weight={1: 1000, 2: 1000, 3: 1000},
        random_state=100,
    ),
    n_estimators=75,
    random_state=100,
)
model_2_v3.fit(X_2_train, y_2_train)

# Accuracy is the classifier's default score; cross_val_score is run on the
# full level-2 data with the shared splitter `cv`.
scores = cross_val_score(model_2_v3, X_2, y_2, cv=cv, n_jobs=-1)
print('Cross Validation accuracy scores: %s' % scores)
print('Cross Validation accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))
print('\n')

# Same evaluation, scored with the non-binary custom metric.
scores = cross_val_score(
    model_2_v3, X_2, y_2,
    scoring=make_scorer(cedric_metric_nonbinary, greater_is_better=True),
    cv=cv, n_jobs=-1,
)
print('Cross Validation custom metric scores: %s' % scores)
print('Cross Validation custom metric: %.3f +/- %.3f' % (scores.mean(), scores.std()))

### Noise level 3

In [None]:
#train and cross-validate the tuned model for noise level 3

# Alternative configuration, kept verbatim for reference and not used.
# NOTE(review): presumably the grid-search result — confirm.
#model_3_v3 = AdaBoostClassifier(random_state = 100, base_estimator = DecisionTreeClassifier(random_state=100, class_weight={1: 1000, 2: 1000, 3: 1000}, criterion='gini', max_depth=1, max_features=6, min_samples_leaf=15, min_samples_split=5), n_estimators=75, learning_rate=2)
model_3_v3 = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(
        max_depth=7,
        max_features=3,
        class_weight={1: 1000, 2: 1000, 3: 1000},
        random_state=100,
    ),
    n_estimators=75,
    random_state=100,
)
model_3_v3.fit(X_3_train, y_3_train)

# Accuracy is the classifier's default score; cross_val_score is run on the
# full level-3 data with the shared splitter `cv`.
scores = cross_val_score(model_3_v3, X_3, y_3, cv=cv, n_jobs=-1)
print('Cross Validation accuracy scores: %s' % scores)
print('Cross Validation accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))
print('\n')

# Same evaluation, scored with the non-binary custom metric.
scores = cross_val_score(
    model_3_v3, X_3, y_3,
    scoring=make_scorer(cedric_metric_nonbinary, greater_is_better=True),
    cv=cv, n_jobs=-1,
)
print('Cross Validation custom metric scores: %s' % scores)
print('Cross Validation custom metric: %.3f +/- %.3f' % (scores.mean(), scores.std()))

## Ada Boost 2
#### Features = 'capacity', 'vib1', 'vib2', 'amp_uni', 'mic1', 'mic2'
#### Value = 'anomaly_binary'

In [None]:
# Columns removed from every noisy data set before modelling. Whatever is left,
# apart from the target column named by `value`, is used as a feature.
# (The grouping below is by column name only.)
drop_list = [
    # index, timestamps and pump / capacity encodings
    'Unnamed: 0', 'datetime', 'date', 'time', 'pump', 'pump_cat', 'capacity_cat',
    # label columns other than the target
    'anomaly', 'anomaly_cat', 'type_cat', 'anomaly_binary_cat', 'type',
    # sensor columns without a noise suffix
    'vib1', 'vib1_x', 'vib1_y', 'vib1_z',
    'vib2', 'vib2_x', 'vib2_y', 'vib2_z',
    'amp1', 'amp2', 'amp_uni',
    'mic1', 'mic2',
    # per-axis vibration noise columns
    'vib1_x_noise', 'vib1_x_with_noise',
    'vib1_y_noise', 'vib1_y_with_noise',
    'vib1_z_noise', 'vib1_z_with_noise',
    'vib2_x_noise', 'vib2_x_with_noise',
    'vib2_y_noise', 'vib2_y_with_noise',
    'vib2_z_noise', 'vib2_z_with_noise',
    # per-channel amplitude and microphone noise columns
    'amp1_noise', 'amp1_with_noise',
    'amp2_noise', 'amp2_with_noise',
    'mic1_noise', 'mic2_noise',
]

# Target column for the "Ada Boost 2" section.
value = 'anomaly_binary'

### First, let's train the model that was previously hypertuned on clean data and see how the metrics change as the noise level changes.

In [None]:
# Baseline for the 'anomaly_binary' target: score one fixed AdaBoost
# configuration on every noise level, using 10 shuffled 80/20 splits per level.
accuracies = {}      # noise level -> array with one accuracy score per split
custom_metrics = {}  # noise level -> array with one custom-metric score per split

# The splitter and the scorer do not depend on the noise level, so they are
# built once instead of once per loop iteration.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)
custom_scorer = make_scorer(cedric_metric_binary, greater_is_better=True)

for level in levels:

    #read in data and drop the columns we don't need
    data = pd.read_csv(f'../data/noisy_data_{level}.csv')
    df = data.drop(drop_list, axis=1)

    #define features and value
    X = df.drop(value, axis=1)
    y = df[value]

    # cross_val_score fits its own clones of the estimator on every split, so the
    # former train/validation split and up-front fit (whose results were never
    # read) have been removed.
    model = AdaBoostClassifier(
        random_state=100,
        base_estimator=DecisionTreeClassifier(random_state=100, max_depth=5, max_features=5),
        n_estimators=75,
    )

    #calculate accuracy (the classifier's default score) and custom metric
    accuracies[level] = cross_val_score(model, X, y, cv=cv, n_jobs=1)
    custom_metrics[level] = cross_val_score(model, X, y, cv=cv, n_jobs=1, scoring=custom_scorer)

print(accuracies)
print(custom_metrics)

In [None]:
# Per noise level: mean CV accuracy first, then mean CV custom metric.
for level in levels:
    for scores_by_level in (accuracies, custom_metrics):
        print(scores_by_level[level].mean())

In [None]:
# Mean CV accuracy per noise level, with the min-max range of the splits shaded.
# x and y are both derived from the keys of `accuracies`, so they cannot get out
# of step with each other.
noise_levels = list(accuracies.keys())
mean_accuracies = [accuracies[n].mean() for n in noise_levels]
# show range of scores using min and max values
lower_lim = [accuracies[n].min() for n in noise_levels]
upper_lim = [accuracies[n].max() for n in noise_levels]

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(noise_levels, mean_accuracies)
ax.fill_between(noise_levels, lower_lim, upper_lim, alpha=0.3, color='gray')

ax.set_title(f'AdaBoost (for {value}) accuracies for different noise levels', fontsize=16)
ax.set_xlabel('noise level', fontsize=14)
ax.set_ylabel('CV accuracy score (mean score / range)', fontsize=14)
ax.tick_params(axis='both', labelsize=14)

# The file name carries the target and the metric. Previously every plot in the
# notebook was written to 'adaboost_noise.jpeg', so each figure overwrote the
# one saved before it.
fig.savefig(f'adaboost_noise_{value}_accuracy.jpeg')

In [None]:
# Mean CV custom metric per noise level, with the min-max range of the splits shaded.
# x and y are both derived from the keys of `custom_metrics`, so they cannot get
# out of step with each other.
noise_levels = list(custom_metrics.keys())
mean_custom_metrics = [custom_metrics[n].mean() for n in noise_levels]
# show range of scores using min and max values
lower_lim = [custom_metrics[n].min() for n in noise_levels]
upper_lim = [custom_metrics[n].max() for n in noise_levels]

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(noise_levels, mean_custom_metrics)
ax.fill_between(noise_levels, lower_lim, upper_lim, alpha=0.3, color='gray')

ax.set_title(f'AdaBoost (for {value}) custom metrics for different noise levels', fontsize=16)
ax.set_xlabel('noise level', fontsize=14)
ax.set_ylabel('CV custom metric score (mean score / range)', fontsize=14)
ax.tick_params(axis='both', labelsize=14)

# The file name carries the target and the metric. Previously every plot in the
# notebook was written to 'adaboost_noise.jpeg', so each figure overwrote the
# one saved before it.
fig.savefig(f'adaboost_noise_{value}_custom_metric.jpeg')

### Now, let's try hypertuning the models for each noise level separately.

In [None]:
#Let's prepare the dataframes for all noise levels
# Later cells address the prepared objects by name (X_1_train, y_2, ...), so each
# level's objects are published as notebook globals once they are all built.

for level in levels:

    #read in data and drop the columns we don't need
    data_level = pd.read_csv('../data/noisy_data_'+str(level)+'.csv')
    df_level = data_level.drop(drop_list, axis=1)

    #define value
    y_level = df_level[value]

    #define features and scale them (the scaler is fitted on the whole level)
    scaler = StandardScaler()
    X_level = scaler.fit_transform(df_level.drop(value, axis=1))

    #create train and test datasets
    X_level_train, X_level_val, y_level_train, y_level_val = train_test_split(
        X_level, y_level, test_size=0.20, random_state=100)

    globals().update({
        f'data_{level}': data_level,
        f'df_{level}': df_level,
        f'X_{level}': X_level,
        f'y_{level}': y_level,
        f'X_{level}_train': X_level_train,
        f'X_{level}_val': X_level_val,
        f'y_{level}_train': y_level_train,
        f'y_{level}_val': y_level_val,
    })


# Define parameters for hypertuning

n_estimators = [50, 75, 100]
base_estimator__max_features = [2, 4, 6]
base_estimator__max_depth = [3, 5, 7]

# Candidate weights for class 1 only; no other class is listed.
weights = [{1: class_1_weight} for class_1_weight in (1000, 3000, 5000)]

l_rate = [1, 2, 3]
criterion = ['gini', 'entropy']

# Keys prefixed with base_estimator__ address the decision tree inside AdaBoost.
search_params = {
    'n_estimators': n_estimators,
    'base_estimator__max_features': base_estimator__max_features,
    'base_estimator__max_depth': base_estimator__max_depth,
    'base_estimator__class_weight': weights,
    'learning_rate': l_rate,
    'base_estimator__criterion': criterion,
}

# Same splitter for the grid searches and for the later cross-validation cells.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=100)


### Noise level 1

In [None]:
#train the untuned reference model for noise level 1
model_1 = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(random_state=100),
    random_state=100,
)
model_1.fit(X_1_train, y_1_train)

#perform hypertuning, ranking the candidates by the binary custom metric
model_1_v2 = GridSearchCV(
    model_1,
    search_params,
    scoring=make_scorer(cedric_metric_binary, greater_is_better=True),
    cv=cv,
    n_jobs=-1,
)
model_1_v2.fit(X_1_train, y_1_train)

best_params = model_1_v2.best_params_
print('Best Parameters', best_params)

In [None]:
#train and cross-validate the tuned model for noise level 1
model_1_v3 = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(
        criterion='gini',
        max_depth=5,
        max_features=4,
        class_weight={1: 3000},
        random_state=100,
    ),
    n_estimators=75,
    learning_rate=3,
    random_state=100,
)
model_1_v3.fit(X_1_train, y_1_train)

# Accuracy is the classifier's default score; cross_val_score is run on the
# full level-1 data with the shared splitter `cv`.
scores = cross_val_score(model_1_v3, X_1, y_1, cv=cv, n_jobs=-1)
print('Cross Validation accuracy scores: %s' % scores)
print('Cross Validation accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))
print('\n')

# Same evaluation, scored with the binary custom metric.
scores = cross_val_score(
    model_1_v3, X_1, y_1,
    scoring=make_scorer(cedric_metric_binary, greater_is_better=True),
    cv=cv, n_jobs=-1,
)
print('Cross Validation custom metric scores: %s' % scores)
print('Cross Validation custom metric: %.3f +/- %.3f' % (scores.mean(), scores.std()))

### Noise level 2

In [None]:
#train the untuned reference model for noise level 2
model_2 = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(random_state=100),
    random_state=100,
)
model_2.fit(X_2_train, y_2_train)

#perform hypertuning, ranking the candidates by the binary custom metric
model_2_v2 = GridSearchCV(
    model_2,
    search_params,
    scoring=make_scorer(cedric_metric_binary, greater_is_better=True),
    cv=cv,
    n_jobs=-1,
)
model_2_v2.fit(X_2_train, y_2_train)

best_params = model_2_v2.best_params_
print('Best Parameters', best_params)

In [None]:
#train and cross-validate the tuned model for noise level 2

model_2_v3 = AdaBoostClassifier(random_state = 100, base_estimator = DecisionTreeClassifier(random_state=100, class_weight={1: 3000}, criterion='gini', max_depth=5, max_features=4), n_estimators=75, learning_rate=3)

# Fit on the level-2 training split. This call previously used X_1_train / y_1_train,
# which left model_2_v3 trained on noise level 1 data.
model_2_v3.fit(X_2_train, y_2_train)

# Accuracy is the classifier's default score; cross_val_score is run on the
# full level-2 data with the shared splitter `cv`.
scores = cross_val_score(model_2_v3, X_2, y_2, cv=cv, n_jobs=-1)
print('Cross Validation accuracy scores: %s' % scores)
print('Cross Validation accuracy: %.3f +/- %.3f' % (np.mean(scores),np.std(scores)))
print('\n')

# Same evaluation, scored with the binary custom metric.
scores = cross_val_score(model_2_v3, X_2, y_2, cv=cv, n_jobs=-1, scoring=make_scorer(cedric_metric_binary, greater_is_better=True))
print('Cross Validation custom metric scores: %s' % scores)
print('Cross Validation custom metric: %.3f +/- %.3f' % (np.mean(scores),np.std(scores)))

### Noise level 3

In [None]:
#train the untuned reference model for noise level 3
model_3 = AdaBoostClassifier(random_state = 100, base_estimator = DecisionTreeClassifier(random_state=100))
model_3.fit(X_3_train, y_3_train)


#perform hypertuning, ranking the candidates by the binary custom metric
model_3_v2 = GridSearchCV(model_3, search_params, cv=cv, n_jobs=-1, scoring=make_scorer(cedric_metric_binary, greater_is_better=True))
model_3_v2.fit(X_3_train, y_3_train)

best_params = model_3_v2.best_params_
# Report the level-3 search result. This line previously printed
# model_2_v2.best_params_, i.e. the noise level 2 parameters.
print('Best Parameters', best_params)

In [None]:
#train and cross-validate the tuned model for noise level 3
model_3_v3 = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(
        criterion='gini',
        max_depth=5,
        max_features=4,
        class_weight={1: 3000},
        random_state=100,
    ),
    n_estimators=50,
    learning_rate=3,
    random_state=100,
)
model_3_v3.fit(X_3_train, y_3_train)

# Accuracy is the classifier's default score; cross_val_score is run on the
# full level-3 data with the shared splitter `cv`.
scores = cross_val_score(model_3_v3, X_3, y_3, cv=cv, n_jobs=-1)
print('Cross Validation accuracy scores: %s' % scores)
print('Cross Validation accuracy: %.3f +/- %.3f' % (scores.mean(), scores.std()))
print('\n')

# Same evaluation, scored with the binary custom metric.
scores = cross_val_score(
    model_3_v3, X_3, y_3,
    scoring=make_scorer(cedric_metric_binary, greater_is_better=True),
    cv=cv, n_jobs=-1,
)
print('Cross Validation custom metric scores: %s' % scores)
print('Cross Validation custom metric: %.3f +/- %.3f' % (scores.mean(), scores.std()))