In [1]:
# Import dependencies

import numpy as np
import pandas as pd
from scipy.stats import ks_2samp
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, average_precision_score
from imblearn.over_sampling import SMOTE, SMOTENC, BorderlineSMOTE, ADASYN, SVMSMOTE
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.utils import shuffle
from sklearn.model_selection import StratifiedKFold
from sklearn import svm
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
import scikitplot as skplt
import matplotlib
import matplotlib.pyplot as plt

from collections import Counter

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the tab-separated data set from the file named 'TRN'

ds = pd.read_csv('TRN', delimiter='\t')

In [None]:
# Select columns: every column except the row index and the two IND_BOM_* columns
# is used as a feature; IND_BOM_1_1 is the target.

features = ds.columns.drop(labels=['INDEX', 'IND_BOM_1_1', 'IND_BOM_1_2'])
X = ds.loc[:, features]
y = ds.loc[:, 'IND_BOM_1_1']

In [None]:
# Selecting features using RandomForest
#
# The forest is seeded so that the set (and number) of selected columns is the
# same on every run, and the selector is fitted exactly once. Reading from
# ds[features] instead of X keeps the cell idempotent when it is re-run.
# NOTE(review): the selector is fitted on all rows, including those later held
# out for validation/test, so the held-out scores are optimistic.

sel = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))
sel = sel.fit(ds[features], y)
X = sel.transform(ds[features])

In [None]:
# Converting X+y to Dataframe 

#  X = pd.DataFrame(data=X)
# y = pd.DataFrame(data=y)
# ds = pd.concat([X,y], axis=1)

# Generating heatmap of correlations

# import seaborn as sns
# import matplotlib.pyplot as plt

# corrmat = ds.corr()
# top_corr_features = corrmat.index
# plt.figure(figsize=(300,300))
# #plot heat map
# g=sns.heatmap(ds[top_corr_features].corr(),annot=True,cmap="RdYlGn")

In [None]:
# bestfeatures = SelectKBest(score_func=f_classif, k=50)
# X = bestfeatures.fit_transform(X, y)

In [None]:
# Separate data for train, validation and test
# Train: 1/2
# Validation: 1/4
# Test: 1/4

# First hold out 1/4 of the data for validation ...
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.25, stratify=y, random_state=43)
# ... then split the remaining 3/4 in a 2:1 ratio, so that the test set is 1/4
# and the training set 1/2 of the full data set (test_size=.95 would leave
# under 4% of the data for training).
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=1/3, stratify=y_train, random_state=44)

In [None]:
# Column positions kept for the (commented-out) np.delete experiment further down;
# not used by the oversampling step itself.
cat_cols = [
    0, 1, 2, 3, 4, 5, 6, 8, 11, 12, 13, 14, 15, 16, 17,
    29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
    44, 45, 46, 47, 48, 49, 50, 51, 52, 55,
    182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197,
    199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214,
    215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230,
    231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242,
]

# Oversample the minority class of the training split only
sm = BorderlineSMOTE(sampling_strategy='minority', random_state=123)
X_train, y_train = sm.fit_resample(X_train, y_train)

In [None]:
# Shuffle the resampled training set; seeded so that re-runs give the same order
X_train, y_train = shuffle(X_train, y_train, random_state=123)

In [None]:
# Utility functions for testing models

# Accuracy: (TP + TN) / N
# Precision: TP / (TP + FP)
# Recall: TP / (TP + FN)
# F1-Measure: Harmonic average between Precision and Recall

def compute_metrics(pred, pred_probs, y, neg_class=0):
    """Compute evaluation metrics for a binary problem whose positive label is 1.

    Parameters
    ----------
    pred : array-like
        Predicted labels.
    pred_probs : array-like of shape (n_samples, 2)
        Per-class scores as returned by ``predict_proba``. Column 1 is used as
        the positive-class score (assumes the classifier orders its classes so
        that label 1 comes last, as is the case for {0, 1} and {-1, 1}).
    y : array-like
        True labels.
    neg_class : int, default 0
        Label that marks the negative class in ``y``.

    Returns
    -------
    tuple
        ``(acc, precision, recall, f_measure, roc_auc, pr_auc, ks, cm)``.
    """
    cm = confusion_matrix(y_true=y, y_pred=pred)
    tn, fp, fn, tp = cm.ravel()
    # Guard the ratios: with no predicted (or no actual) positives report 0 instead of NaN
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    acc = (tp + tn) / (tp + tn + fp + fn)
    f_measure = f1_score(y, pred)

    # Ranking metrics need the score of the positive class. Taking the larger of
    # the two class probabilities would only measure confidence, not direction.
    pos_scores = np.asarray(pred_probs)[:, 1]
    roc_auc = roc_auc_score(y, pos_scores)
    pr_auc = average_precision_score(y, pos_scores)

    # KS statistic: largest vertical gap between the cumulative score
    # distributions of the two classes (evaluated on generate_dist's 0.01 grid).
    neg_cdf, pos_cdf = generate_dist(pos_scores, np.array(y), neg_class)
    ks = np.max(np.abs(neg_cdf - pos_cdf))
    return acc, precision, recall, f_measure, roc_auc, pr_auc, ks, cm

def report_performance_metrics(pred, pred_probs, _y, neg_class=0):
    """Plot the KS curve and print the metrics returned by ``compute_metrics``.

    Parameters
    ----------
    pred : array-like
        Predicted labels.
    pred_probs : array-like of shape (n_samples, 2)
        Per-class scores from ``predict_proba``.
    _y : array-like
        True labels.
    neg_class : int, default 0
        Label that marks the negative class in ``_y``.
    """
    acc, prec, rec, f_measure, roc_auc, pr_auc, ks, cm = compute_metrics(pred, pred_probs, _y, neg_class)
    skplt.metrics.plot_ks_statistic(_y, pred_probs)
    plt.show()
    print('Accuracy:', acc)
    print('Precision:', prec)
    print('Recall:', rec)
    print('F-Measure:', f_measure)
    print('AUROC:', roc_auc)
    print('AUPR:', pr_auc)
    # The KS value was computed but never shown; report it next to the other metrics
    print('KS:', ks)
    print('Confusion Matrix:')
    print(cm)
    
def train_test_k_fold(k, clf, _X, _y, neg_class=0):
    """Run stratified k-fold cross-validation and print the metrics of every fold.

    Parameters
    ----------
    k : int
        Number of folds.
    clf : estimator
        Classifier exposing ``fit``, ``predict`` and ``predict_proba``. It is
        fitted in place, so after the call it holds the model trained on the
        training part of the last fold.
    _X, _y : array-like
        Samples and labels. pandas objects are indexed by position.
    neg_class : int, default 0
        Label that marks the negative class in ``_y``.
    """
    def _take_rows(data, rows):
        # Fold indices are positional; plain [] on a pandas object would be label-based
        return data.iloc[rows] if hasattr(data, 'iloc') else data[rows]

    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=11)
    for it, (train_index, test_index) in enumerate(skf.split(_X, _y), start=1):
        clf.fit(_take_rows(_X, train_index), _take_rows(_y, train_index))
        X_test_batch = _take_rows(_X, test_index)
        y_test_batch = _take_rows(_y, test_index)
        results = clf.predict(X_test_batch)
        results_probs = clf.predict_proba(X_test_batch)
        print('K Fold it', it)
        report_performance_metrics(results, results_probs, y_test_batch, neg_class)
        print('')
        
def test_model(clf, _X, _y, neg_class=0):
    """Evaluate an already fitted classifier on ``(_X, _y)`` and print its metrics.

    Parameters
    ----------
    clf : estimator
        Fitted classifier exposing ``predict`` and ``predict_proba``.
    _X, _y : array-like
        Samples and true labels.
    neg_class : int, default 0
        Label that marks the negative class in ``_y`` (pass -1 for -1/1 labels).
    """
    results = clf.predict(_X)
    results_probs = clf.predict_proba(_X)
    report_performance_metrics(results, results_probs, _y, neg_class)
    
def generate_dist(probas, y, neg_class):
    """Cumulative score distributions of the negative and the positive class.

    For each threshold 0.01, 0.02, ..., 1.00 the fraction of samples of a class
    whose score is less than or equal to the threshold is computed.

    Parameters
    ----------
    probas : numpy.ndarray
        One score per sample.
    y : numpy.ndarray
        True labels; the positive class is the label 1.
    neg_class : int
        Label of the negative class.

    Returns
    -------
    tuple of numpy.ndarray
        ``(negative_distribution, positive_distribution)``, 100 values each.
    """
    thresholds = np.arange(1, 101) / 100
    neg_scores = probas[y == neg_class]
    pos_scores = probas[y == 1]
    neg_counts = np.array([np.count_nonzero(neg_scores <= t) for t in thresholds])
    pos_counts = np.array([np.count_nonzero(pos_scores <= t) for t in thresholds])
    return neg_counts / len(neg_scores), pos_counts / len(pos_scores)

In [None]:
#_X_train = np.delete(X_train, cat_cols, 1)
#kk = np.delete(X_val, cat_cols, 1)

In [None]:
# Disabled SVM grid search: the whole block is a bare string literal, so nothing
# in it is executed and no `search` variable is created by this cell.
'''params = {
    'kernel': ['rbf', 'linear', 'poly'],
    'C': [.0001, .01, 1, 10, 100],
    'gamma': [.0001, .001, .01, .1, 1, 10, 100],
}

clf = svm.SVC(decision_function_shape='ovo', verbose=True, max_iter=500)
grid = GridSearchCV(estimator=clf, param_grid=params)
search = grid.fit(X_train, y_train)
'''

In [None]:
#params = search.best_params_
#params

In [None]:
# Train SVM with k fold

params = {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}

svm_clf = svm.SVC(**params, decision_function_shape='ovo',
              verbose=True, max_iter=200, probability=True)

# Labels are recoded to {-1, 1} for the SVM, so the metric helpers must be told
# that the negative class is -1 (their default of 0 matches no sample).
_y_train = np.where(np.asarray(y_train) == 1, 1, -1)

train_test_k_fold(5, svm_clf, X_train, _y_train, neg_class=-1)

# Cross-validation leaves the model fitted on the last fold only; refit on the
# whole training set before it is evaluated on the validation and test sets.
svm_clf.fit(X_train, _y_train)

In [None]:
# Test SVM on validation set

# Same {-1, 1} recoding as used for training; neg_class=-1 tells the metric
# helpers which label is the negative one.
_y_val = np.where(np.asarray(y_val) == 1, 1, -1)
report_performance_metrics(svm_clf.predict(X_val), svm_clf.predict_proba(X_val), _y_val, neg_class=-1)

In [None]:
# Test SVM on test set

# Same {-1, 1} recoding as used for training; neg_class=-1 tells the metric
# helpers which label is the negative one.
_y_test = np.where(np.asarray(y_test) == 1, 1, -1)
report_performance_metrics(svm_clf.predict(X_test), svm_clf.predict_proba(X_test), _y_test, neg_class=-1)

In [None]:
# Define MLP ensemble (Bagging method) and train it

_mlp = MLPClassifier(hidden_layer_sizes= (12, 12), learning_rate_init= 0.005, solver= 'adam',
                     alpha=1e-4, verbose=True, activation='relu', batch_size=128, max_iter=20, tol=1e-7)

# The wrapped estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, and the
# positional form works with both.
mlp_ensemble = BaggingClassifier(_mlp, n_estimators=5, max_samples=.2,
                  bootstrap=False, bootstrap_features=True, n_jobs=8, verbose=True)

mlp_ensemble.fit(X_train, y_train)

In [None]:
# Evaluate the bagged MLP ensemble on the validation split

test_model(clf=mlp_ensemble, _X=X_val, _y=y_val)

In [None]:
# Evaluate the bagged MLP ensemble on the test split

test_model(clf=mlp_ensemble, _X=X_test, _y=y_test)

In [None]:
# Run grid search to find best parameters
# NOTE: both blocks below are bare string literals. The first records the best
# parameters that were found; the second is the disabled search itself, so this
# cell executes nothing and does not define `search`.
'''
    Best parameters found:
        Solver: adam
        hidden_layers: (12,12)
        learning_rate: 0.005
'''


'''solvers = ['lbfgs', 'adam',]
hidden_layers = [(12,12), (12,12,12), (5,5,5), (5,5)]
learning_rates = [.001, .0001, .005]

clf = MLPClassifier()
grid = GridSearchCV(estimator=clf, param_grid=dict(solver=solvers, hidden_layer_sizes=hidden_layers, learning_rate_init=learning_rates))
search = grid.fit(X_train, y_train)
'''

In [None]:
# Best parameters found on grid search
#
# The grid search itself is disabled (it lives inside a string literal), so no
# `search` object exists on a fresh kernel; the recorded best parameters are
# used directly. To re-run the search, enable it and use:
# params = search.best_params_

params = {
    'solver': 'adam',
    'hidden_layer_sizes': (12,12),
    'learning_rate_init': .005,
}

In [None]:
# Train MLP with best parameters found on grid search
mlp = MLPClassifier(**params, alpha=1e-4, verbose=True, activation='relu', batch_size=128, max_iter=20, tol=1e-7)

# Cross-validate first (comment out the next line to skip k fold), then fit on
# the whole training set. The data is passed as plain arrays because the fold
# indices are positional, while [] on a pandas Series selects by index label.
train_test_k_fold(5, mlp, np.asarray(X_train), np.asarray(y_train))
mlp.fit(X_train, y_train)

In [None]:
# Train Keras MLP
# NOTE: disabled - the model definition and training below are wrapped in a
# string literal, so no Keras model is built or fitted by this cell.

'''input_dims = X_train.shape

model = Sequential()
model.add(Dense(1, input_dim=input_dims[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='mean_squared_error', optimizer='adam')

X_train, y_train = shuffle(X_train, y_train)
X_val, y_val = shuffle(X_val, y_val)
X_val, y_val = np.array(X_val), np.array(y_val)

history = model.fit(X_train, y_train, epochs=2, shuffle=True, batch_size=128, validation_data=(X_val, y_val))
'''

In [None]:
# Evaluate the single MLP on the validation split
test_model(clf=mlp, _X=X_val, _y=y_val)

In [None]:
# Evaluate the single MLP on the test split
test_model(clf=mlp, _X=X_test, _y=y_test)