In [1]:
# Import dependencies

import numpy as np
import pandas as pd
from scipy.stats import ks_2samp
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, average_precision_score
from imblearn.over_sampling import SMOTE, SMOTENC, BorderlineSMOTE, ADASYN
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.utils import shuffle
from sklearn.model_selection import StratifiedKFold
from sklearn import svm
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV

from collections import Counter

Using TensorFlow backend.


In [2]:
# Load data

ds = pd.read_csv('TRN', sep='\t')

In [3]:
# Select columns

features = ds.columns.drop(['INDEX', 'IND_BOM_1_1', 'IND_BOM_1_2'])
X = ds[features]
y = ds['IND_BOM_1_1']

In [4]:
# Separate data for train, validation and test
# Train: 1/2
# Validation: 1/4
# Test: 1/4

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.25, stratify=y, random_state=43)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=.98, stratify=y_train, random_state=44)

In [5]:
cat_cols = [0, 1, 2, 3, 4, 5, 6, 8, 11, 12, 13, 14, 15, 16, 17, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 52, 55, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242]
sm = BorderlineSMOTE(random_state=123, sampling_strategy='minority')
X_train, y_train = sm.fit_resample(X_train, y_train)

In [6]:
print(Counter(y_train))

Counter({1: 3826, 0: 3826})


In [7]:
X_train, y_train = shuffle(X_train, y_train)

In [8]:
# Utility functions for testing models

# Accuracy: (TP + TN) / N
# Precision: TP / (TP + FP)
# Recall: TP / (TP + FN)
# F1-Measure: Harmonic average between Precision and Recall

def compute_metrics(pred, pred_probs, y, neg_class=0):
    cm = confusion_matrix(y_true=y, y_pred=pred)
    tn, fp, fn, tp = cm.ravel()
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    acc = (tp+tn) / (tp+tn+fp+fn)
    f_measure = f1_score(y, pred)
    
    pred_probs = np.array([v[0] if v[0] > v[1] else v[1] for v in pred_probs])
    roc_auc = roc_auc_score(y, pred_probs)
    pr_auc = average_precision_score(y, pred_probs)
    gd = generate_dist(pred_probs, np.array(y), neg_class)
    ks = ks_2samp(gd[0], gd[1])[0]
    return acc, precision, recall, f_measure, roc_auc, pr_auc, ks, cm

def report_performance_metrics(pred, pred_probs, _y, neg_class=0):
    acc, prec, rec, f_measure, roc_auc, pr_auc, ks, cm = compute_metrics(pred, pred_probs, _y, neg_class)
    print('Accuracy:', acc)
    print('Precision:', prec)
    print('Recall:', rec)
    print('F-Measure:', f_measure)
    print('AUROC:', roc_auc)
    print('AUPR:', pr_auc)
    print('Confusion Matrix:')
    print('KS test:', ks)
    print(cm)
    
def train_test_k_fold(k, clf, _X, _y, neg_class=0):
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=11)
    it = 1
    for train_index, test_index in skf.split(_X, _y):
        X_batch = _X[train_index]
        y_batch = _y[train_index]
        X_test_batch = _X[test_index]
        y_test_batch = _y[test_index]
        clf.fit(X_batch, y_batch)
        results = clf.predict(X_test_batch)
        results_probs = clf.predict_proba(X_test_batch)
        print('K Fold it', it)
        report_performance_metrics(results, results_probs, y_test_batch, neg_class)
        print('')
        it += 1
        
def test_model(clf, _X, _y):
    results = clf.predict(_X)
    results_probs = clf.predict_proba(_X)
    report_performance_metrics(results, results_probs, _y)
    
def generate_dist(probas, y, neg_class):
    dist_1 = [0] * 100
    dist_2 = [0] * 100
    for_perc = Counter(y)
    for i in range(1,101):
        limiar = i/100        
        lower = y[np.where(probas <= limiar)]
        count = Counter(lower)
        dist_1[i-1] = count[neg_class]
        dist_2[i-1] = count[1]
    return np.array(dist_1)/for_perc[neg_class], np.array(dist_2)/for_perc[1]

In [9]:
#_X_train = np.delete(X_train, cat_cols, 1)
#kk = np.delete(X_val, cat_cols, 1)

In [10]:
'''params = {
    'kernel': ['rbf', 'linear', 'poly'],
    'C': [.0001, .01, 1, 10, 100],
    'gamma': [.0001, .001, .01, .1, 1, 10, 100],
}

clf = svm.SVC(decision_function_shape='ovo', verbose=True, max_iter=500)
grid = GridSearchCV(estimator=clf, param_grid=params)
search = grid.fit(X_train, y_train)
'''

"params = {\n    'kernel': ['rbf', 'linear', 'poly'],\n    'C': [.0001, .01, 1, 10, 100],\n    'gamma': [.0001, .001, .01, .1, 1, 10, 100],\n}\n\nclf = svm.SVC(decision_function_shape='ovo', verbose=True, max_iter=500)\ngrid = GridSearchCV(estimator=clf, param_grid=params)\nsearch = grid.fit(X_train, y_train)\n"

In [11]:
#params = search.best_params_
#params

In [12]:
# Train SVM with k fold 

params = {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}

svm_clf = svm.SVC(**params, decision_function_shape='ovo', 
              verbose=True, max_iter=200, probability=True)

_y_train = np.array([v if v == 1 else -1 for v in y_train])

train_test_k_fold(5, svm_clf, X_train, _y_train)

[LibSVM]



K Fold it 1
Accuracy: 0.610313315926893
Precision: 0.651705565529623
Recall: 0.47389033942558745
F-Measure: 0.5487528344671202
AUROC: 0.3712650573662647
AUPR: 0.4039976017926113
Confusion Matrix:
KS test: 1.0
[[572 194]
 [403 363]]

[LibSVM]



K Fold it 2
Accuracy: 0.6457516339869281
Precision: 0.6335329341317365
Recall: 0.6915032679738562
F-Measure: 0.66125
AUROC: 0.4256038275876799
AUPR: 0.42962050999648754
Confusion Matrix:
KS test: 1.0
[[459 306]
 [236 529]]

[LibSVM]



K Fold it 3
Accuracy: 0.5516339869281046
Precision: 0.7231638418079096
Recall: 0.16732026143790849
F-Measure: 0.27176220806794055
AUROC: 0.3330334486735871
AUPR: 0.3906227056666516
Confusion Matrix:
KS test: 1.0
[[716  49]
 [637 128]]

[LibSVM]



K Fold it 4
Accuracy: 0.6620915032679738
Precision: 0.6666666666666666
Recall: 0.6483660130718955
F-Measure: 0.6573889993373094
AUROC: 0.4191874919902602
AUPR: 0.43052437678679584
Confusion Matrix:
KS test: 1.0
[[517 248]
 [269 496]]

[LibSVM]



K Fold it 5
Accuracy: 0.6169934640522876
Precision: 0.6765285996055227
Recall: 0.44836601307189544
F-Measure: 0.5393081761006289
AUROC: 0.3415916955017301
AUPR: 0.39418064337313363
Confusion Matrix:
KS test: 1.0
[[601 164]
 [422 343]]





In [13]:
# Test SVM  on validation set

_y_val = np.array([v if v == 1 else -1 for v in y_val])
test_model(svm_clf, X_val, _y_val)

Accuracy: 0.5114441052837131
Precision: 0.694943334613907
Recall: 0.45383698685984886
F-Measure: 0.549088425565821
AUROC: 0.47351870478870123
AUPR: 0.6423656144249102
Confusion Matrix:
KS test: 1.0
[[20820 12705]
 [34831 28943]]




In [14]:
# Test SVM on test set

_y_test = np.array([v if v == 1 else -1 for v in y_test])
test_model(svm_clf, X_test, _y_test)

Accuracy: 0.5086205691113752
Precision: 0.6928995347460831
Recall: 0.4495727954431514
F-Measure: 0.5453239830245316
AUROC: 0.4771637838211466
AUPR: 0.645337652422363
Confusion Matrix:
KS test: 1.0
[[ 61202  37360]
 [103204  84294]]




In [15]:
# Define MLP ensemble (Bagging method) and train it

_mlp = MLPClassifier(hidden_layer_sizes= (12, 12), learning_rate_init= 0.005, solver= 'adam',
                     alpha=1e-4, verbose=True, activation='relu', batch_size=128, max_iter=20, tol=1e-7)

mlp_ensemble = BaggingClassifier(base_estimator=_mlp, n_estimators=5, max_samples=.2, 
                  bootstrap=False, bootstrap_features=True, n_jobs=8, verbose=True)

mlp_ensemble.fit(X_train, y_train)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:    2.0s remaining:    3.0s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:    2.5s finished


BaggingClassifier(base_estimator=MLPClassifier(activation='relu', alpha=0.0001,
                                               batch_size=128, beta_1=0.9,
                                               beta_2=0.999,
                                               early_stopping=False,
                                               epsilon=1e-08,
                                               hidden_layer_sizes=(12, 12),
                                               learning_rate='constant',
                                               learning_rate_init=0.005,
                                               max_iter=20, momentum=0.9,
                                               n_iter_no_change=10,
                                               nesterovs_momentum=True,
                                               power_t=0.5, random_state=None,
                                               shuffle=True, solver='adam',
                                               tol=1e-07,
    

In [16]:
# Test MLP ensemble on validation set

test_model(mlp_ensemble, X_val, y_val)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:    0.5s remaining:    0.7s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:    0.7s finished
[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:    0.5s remaining:    0.8s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:    0.6s finished


Accuracy: 0.5938601630027031
Precision: 0.7395753086419753
Recall: 0.5870887822623639
F-Measure: 0.6545686588169477
AUROC: 0.5192103998770641
AUPR: 0.6827023913882746
Confusion Matrix:
KS test: 0.03
[[20341 13184]
 [26333 37441]]


In [17]:
# Test MLP Ensemble on test set 

test_model(mlp_ensemble, X_test, y_test)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:    1.7s remaining:    2.5s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:    1.9s finished
[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:    1.4s remaining:    2.1s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:    1.7s finished


Accuracy: 0.5957980843179752
Precision: 0.7408901997586808
Recall: 0.5894782877684028
F-Measure: 0.6565679967209024
AUROC: 0.5200043160558591
AUPR: 0.6824004773096084
Confusion Matrix:
KS test: 0.04
[[ 59908  38654]
 [ 76972 110526]]


In [18]:
# Run grid search to find best parameters
'''
    Best parameters found:
        Solver: adam
        hidden_layers: (12,12)
        learning_rate: 0.005
'''


'''solvers = ['lbfgs', 'adam',]
hidden_layers = [(12,12), (12,12,12), (5,5,5), (5,5)]
learning_rates = [.001, .0001, .005]

clf = MLPClassifier()
grid = GridSearchCV(estimator=clf, param_grid=dict(solver=solvers, hidden_layer_sizes=hidden_layers, learning_rate_init=learning_rates))
search = grid.fit(X_train, y_train)
'''

"solvers = ['lbfgs', 'adam',]\nhidden_layers = [(12,12), (12,12,12), (5,5,5), (5,5)]\nlearning_rates = [.001, .0001, .005]\n\nclf = MLPClassifier()\ngrid = GridSearchCV(estimator=clf, param_grid=dict(solver=solvers, hidden_layer_sizes=hidden_layers, learning_rate_init=learning_rates))\nsearch = grid.fit(X_train, y_train)\n"

In [19]:
# Rename best parameters found on grid search

#params = search.best_params_

params = {
    'solver': 'adam',
    'hidden_layer_sizes': (12,12),
    'learning_rate_init': .005,
}

In [20]:
# Train MLP with best parameters found on grid search
mlp = MLPClassifier(**params, alpha=1e-4, verbose=True, activation='relu', batch_size=128, max_iter=20, tol=1e-7)

# Uncomment below line to run k fold on the MLP
train_test_k_fold(5, mlp, X_train, y_train)
mlp.fit(X_train, y_train)

Iteration 1, loss = 0.68333623
Iteration 2, loss = 0.65684013
Iteration 3, loss = 0.63781253
Iteration 4, loss = 0.62808814
Iteration 5, loss = 0.62358680
Iteration 6, loss = 0.61743380
Iteration 7, loss = 0.61664109
Iteration 8, loss = 0.60945337
Iteration 9, loss = 0.60079372
Iteration 10, loss = 0.60183473
Iteration 11, loss = 0.60025061
Iteration 12, loss = 0.59713203
Iteration 13, loss = 0.59237457
Iteration 14, loss = 0.58755480
Iteration 15, loss = 0.58239075
Iteration 16, loss = 0.58453338
Iteration 17, loss = 0.58424987
Iteration 18, loss = 0.58069505
Iteration 19, loss = 0.58949189
Iteration 20, loss = 0.57302026
K Fold it 1
Accuracy: 0.6422976501305483
Precision: 0.653954802259887
Recall: 0.6044386422976501
F-Measure: 0.6282225237449117
AUROC: 0.6138582306785103
AUPR: 0.6508822355874017
Confusion Matrix:
KS test: 0.19
[[521 245]
 [303 463]]

Iteration 1, loss = 0.70555977




Iteration 2, loss = 0.65876761
Iteration 3, loss = 0.64085972
Iteration 4, loss = 0.63303489
Iteration 5, loss = 0.62313058
Iteration 6, loss = 0.61829124
Iteration 7, loss = 0.61236887
Iteration 8, loss = 0.60404994
Iteration 9, loss = 0.60563324
Iteration 10, loss = 0.59751181
Iteration 11, loss = 0.59640858
Iteration 12, loss = 0.59215258
Iteration 13, loss = 0.58745750
Iteration 14, loss = 0.58444033
Iteration 15, loss = 0.58302784
Iteration 16, loss = 0.58091100
Iteration 17, loss = 0.58396349
Iteration 18, loss = 0.57448652
Iteration 19, loss = 0.57164912
Iteration 20, loss = 0.56570873
K Fold it 2
Accuracy: 0.6274509803921569
Precision: 0.66553480475382
Recall: 0.5124183006535947
F-Measure: 0.5790251107828656
AUROC: 0.5031944978427101
AUPR: 0.5501695909820914
Confusion Matrix:
KS test: 0.06
[[568 197]
 [373 392]]





Iteration 1, loss = 0.68248533
Iteration 2, loss = 0.65314712
Iteration 3, loss = 0.63687531
Iteration 4, loss = 0.62643553
Iteration 5, loss = 0.61177484
Iteration 6, loss = 0.60316173
Iteration 7, loss = 0.60026143
Iteration 8, loss = 0.58753599
Iteration 9, loss = 0.58055257
Iteration 10, loss = 0.57436161
Iteration 11, loss = 0.56287586
Iteration 12, loss = 0.55778484
Iteration 13, loss = 0.54643348
Iteration 14, loss = 0.54765290
Iteration 15, loss = 0.54016007
Iteration 16, loss = 0.53130574
Iteration 17, loss = 0.52117662
Iteration 18, loss = 0.51576892
Iteration 19, loss = 0.50276344
Iteration 20, loss = 0.49470109
K Fold it 3




Accuracy: 0.6503267973856209
Precision: 0.650523560209424
Recall: 0.6496732026143791
F-Measure: 0.6500981033355133
AUROC: 0.6027784185569652
AUPR: 0.6407847866322154
Confusion Matrix:
KS test: 0.09
[[498 267]
 [268 497]]

Iteration 1, loss = 0.68249701
Iteration 2, loss = 0.65883465
Iteration 3, loss = 0.64403681
Iteration 4, loss = 0.63128869
Iteration 5, loss = 0.62711376
Iteration 6, loss = 0.61415526
Iteration 7, loss = 0.60776817
Iteration 8, loss = 0.59599049
Iteration 9, loss = 0.59318326
Iteration 10, loss = 0.57633810
Iteration 11, loss = 0.57128107
Iteration 12, loss = 0.56149254
Iteration 13, loss = 0.54973356
Iteration 14, loss = 0.55163631
Iteration 15, loss = 0.53158102
Iteration 16, loss = 0.52149683
Iteration 17, loss = 0.51220162
Iteration 18, loss = 0.51438648
Iteration 19, loss = 0.50122906
Iteration 20, loss = 0.48403231
K Fold it 4
Accuracy: 0.6529411764705882
Precision: 0.6833855799373041
Recall: 0.5699346405228758
F-Measure: 0.6215253029223093
AUROC: 0.5449339997



Iteration 2, loss = 0.65313523
Iteration 3, loss = 0.63952207
Iteration 4, loss = 0.62345642
Iteration 5, loss = 0.61820747
Iteration 6, loss = 0.60470626
Iteration 7, loss = 0.59839797
Iteration 8, loss = 0.58676391
Iteration 9, loss = 0.57518038
Iteration 10, loss = 0.56361568
Iteration 11, loss = 0.55069428
Iteration 12, loss = 0.54585989
Iteration 13, loss = 0.52861499
Iteration 14, loss = 0.52369920
Iteration 15, loss = 0.50693164
Iteration 16, loss = 0.49709760
Iteration 17, loss = 0.50069469
Iteration 18, loss = 0.49766130
Iteration 19, loss = 0.47309087
Iteration 20, loss = 0.46658335
K Fold it 5




Accuracy: 0.6627450980392157
Precision: 0.6936236391912908
Recall: 0.5830065359477125
F-Measure: 0.6335227272727273
AUROC: 0.559842795506002
AUPR: 0.5968546291256449
Confusion Matrix:
KS test: 0.06
[[568 197]
 [319 446]]

Iteration 1, loss = 0.68478905
Iteration 2, loss = 0.66010209
Iteration 3, loss = 0.64492215
Iteration 4, loss = 0.63423256
Iteration 5, loss = 0.62649213
Iteration 6, loss = 0.61770371
Iteration 7, loss = 0.61240575
Iteration 8, loss = 0.60578288
Iteration 9, loss = 0.59837804
Iteration 10, loss = 0.59003593
Iteration 11, loss = 0.58388256
Iteration 12, loss = 0.58263487
Iteration 13, loss = 0.57407410
Iteration 14, loss = 0.56429326
Iteration 15, loss = 0.55712218
Iteration 16, loss = 0.55766480
Iteration 17, loss = 0.54881600
Iteration 18, loss = 0.54444262
Iteration 19, loss = 0.53933141
Iteration 20, loss = 0.52956263




MLPClassifier(activation='relu', alpha=0.0001, batch_size=128, beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(12, 12), learning_rate='constant',
              learning_rate_init=0.005, max_iter=20, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=1e-07,
              validation_fraction=0.1, verbose=True, warm_start=False)

In [21]:
# Train Keras MLP

'''input_dims = X_train.shape

model = Sequential()
model.add(Dense(1, input_dim=input_dims[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='mean_squared_error', optimizer='adam')

X_train, y_train = shuffle(X_train, y_train)
X_val, y_val = shuffle(X_val, y_val)
X_val, y_val = np.array(X_val), np.array(y_val)

history = model.fit(X_train, y_train, epochs=2, shuffle=True, batch_size=128, validation_data=(X_val, y_val))
'''

"input_dims = X_train.shape\n\nmodel = Sequential()\nmodel.add(Dense(1, input_dim=input_dims[1], activation='relu'))\nmodel.add(Dropout(0.5))\nmodel.add(Dense(1, activation='sigmoid'))\n\nmodel.compile(loss='mean_squared_error', optimizer='adam')\n\nX_train, y_train = shuffle(X_train, y_train)\nX_val, y_val = shuffle(X_val, y_val)\nX_val, y_val = np.array(X_val), np.array(y_val)\n\nhistory = model.fit(X_train, y_train, epochs=2, shuffle=True, batch_size=128, validation_data=(X_val, y_val))\n"

In [22]:
# Test MLP on validation set

test_model(mlp, X_val, y_val)

Accuracy: 0.584785044039507
Precision: 0.7147792847428971
Recall: 0.6098723617775269
F-Measure: 0.6581717264020035
AUROC: 0.5483018632607544
AUPR: 0.7121323470820173
Confusion Matrix:
KS test: 0.07
[[18005 15520]
 [24880 38894]]


In [23]:
# Test MLP on test set

test_model(mlp, X_test, y_test)

Accuracy: 0.5864329161714326
Precision: 0.7165841351721893
Recall: 0.6104865118561265
F-Measure: 0.6592941379757226
AUROC: 0.5482744191092763
AUPR: 0.7121686237212258
Confusion Matrix:
KS test: 0.07
[[ 53290  45272]
 [ 73033 114465]]
