In [123]:
# Import dependencies

import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score, average_precision_score
from imblearn.over_sampling import SMOTE
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.utils import shuffle

In [127]:
# Load data

ds = pd.read_csv('TRN', sep='\t')

In [151]:
# Select columns

features = ds.columns.drop(['IND_BOM_1_1', 'IND_BOM_1_2'])
X = ds[features]
y = ds['IND_BOM_1_1']

In [152]:
# Separate data for train, validation and test
# Train: 1/2
# Validation: 1/4
# Test: 1/4

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.25, stratify=y, random_state=43)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=.33, stratify=y_train, random_state=44)

In [153]:
# Rebalance data (class 0 has way less samples than class 1)

sm = SMOTE(random_state=45)
X_train, y_train = sm.fit_resample(X_train, y_train)

In [154]:
# Train the model (learn parameters)

# clf = MLPClassifier(solver='lbfgs', alpha=1e-4, learning_rate_init=.001, hidden_layer_sizes=(8,8,8), random_state=1, activation='relu')
# clf.fit(X_train, y_train)

input_dims = X_train.shape

model = Sequential()
model.add(Dense(1, input_dim=input_dims[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='mean_squared_error', optimizer='adam')

X_train, y_train = shuffle(X_train, y_train)
X_val, y_val = shuffle(X_val, y_val)
X_val, y_val = np.array(X_val), np.array(y_val)

history = model.fit(X_train, y_train, epochs=2, shuffle=True, batch_size=128, validation_data=(X_val, y_val))

Train on 256372 samples, validate on 97299 samples
Epoch 1/2
Epoch 2/2


In [161]:
# Predict for test dataset

#results = clf.predict(X_val)
#results_probs = clf.predict_proba(X_val)

#loss, _ = model.evaluate(X_val, y_val, batch_size=128)
results = model.predict_classes(X_test, batch_size=128)

In [162]:
print(len([v for v in results if v == 1]))
print(history.history)

96323
{'val_loss': [0.2507265499436263, 0.24994944172431294], 'loss': [0.25000850877995745, 0.25000788952049835]}


In [44]:
# Compute metrics

# Accuracy: (TP + TN) / N
# Precision: TP / (TP + FP)
# Recall: TP / (TP + FN)
# F1-Measure: Harmonic average between Precision and Recall

def compute_metrics(pred, pred_probs, y):
    cm = confusion_matrix(y_true=y, y_pred=pred)
    tn, fp, fn, tp = cm.ravel()
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    acc = (tp+tn) / (tp+tn+fp+fn)
    f_measure = f1_score(y, pred)
    roc_auc = roc_auc_score(y, pred_probs[:,0])
    pr_auc = average_precision_score(y, pred_probs[:,0])
    print(tn, fp, fn, tp)
    return acc, precision, recall, f_measure, roc_auc, pr_auc, cm

acc, prec, rec, f_measure, roc_auc, pr_auc, cm = compute_metrics(results, results_probs, y_val)
print(acc, prec, rec, f_measure, roc_auc, pr_auc, cm)

0 33189 0 63138
0.6554548568937059 0.6554548568937059 1.0 0.7918728247577839 0.4999754691878363 0.6554437770653577 [[    0 33189]
 [    0 63138]]


In [45]:
# Predict for validation dataset

results = clf.predict(X_test)
results_probs = clf.predict_proba(X_test)

In [46]:
acc, prec, rec, f_measure, roc_auc, pr_auc, cm = compute_metrics(results, results_probs, y_test)
print(acc, prec, rec, f_measure, roc_auc, pr_auc, cm)

1 33524 0 63774
0.6554538073361494 0.6554502661925219 1.0 0.7918694745207112 0.49996156590151364 0.6554235627229548 [[    1 33524]
 [    0 63774]]
