In [126]:
from keras.models import Sequential
from keras.layers import Dense
import pandas as pd
import numpy
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, auc, roc_auc_score, precision_score, recall_score, f1_score, accuracy_score
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [4]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

In [5]:
from imblearn.over_sampling import SMOTE

In [6]:
def plotROC(fpr, tpr):
    plt.figure()
    plt.plot(fpr, tpr, label='ROC curve, area = {:.2}'.format(auc(fpr, tpr)))
    plt.plot([0,1], [0,1], 'k--')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC Curve')
    plt.legend(loc='lower right')
    plt.show()

In [41]:
features = numpy.loadtxt('modifiedTraining.csv', delimiter=',')
labels = numpy.loadtxt('labels.csv', delimiter=',')

In [42]:
features.shape

(15420, 159)

In [43]:
labels.shape

(15420,)

In [140]:
trainingX, testX, trainingY, testY = train_test_split(features, labels, test_size=0.3, random_state=1)

In [141]:
# using SMOTE for over-sampling
sm = SMOTE(kind='regular')

In [142]:
x, y = sm.fit_sample(trainingX, trainingY)

In [143]:
print('Number of fraudulent cases = {}\nNumber of true cases = {}'.format(sum(y[:] == 1), sum(y[:] == 0)))

Number of fraudulent cases = 10180
Number of true cases = 10180


In [144]:
# normalizing training data to zero mean and unit standard deviation
normalizedX = x - numpy.mean(x, axis=0)
sd = numpy.std(normalizedX, axis=0)
sd[sd[:] == 0] = 1
normalizedX /= sd

In [146]:
ohe1 = OneHotEncoder()
ohe1.fit(y.reshape(-1,1))
transformedTrainingY = ohe1.transform(y.reshape(-1,1)).toarray()

In [147]:
normalizedTestX = testX - numpy.mean(testX, axis=0)
sd = numpy.std(normalizedTestX, axis=0)
sd[sd[:] == 0] = 1
normalizedTestX /= sd

In [56]:
from tensorflow import set_random_seed

In [77]:
nodesInHiddenLayer = list(range(20,91,5))
predictions = numpy.zeros((testX.shape[0], len(nodesInHiddenLayer)))

In [81]:
for i,nodes in enumerate(nodesInHiddenLayer):
    print('Number of hidden nodes = {}'.format(nodes))
    set_random_seed(2)
    numpy.random.seed(2)
    model = Sequential()
    model.add(Dense(nodes, activation='relu', input_shape=(159,)))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer='sgd')
    model.fit(x, transformedTrainingY, batch_size = 64, epochs=100, verbose=0)
    loss, accuracy = model.evaluate(testX, ohe1.transform(testY.reshape(-1,1)).toarray(), verbose=0)
    predicted = model.predict(testX)
    classes = numpy.argmax(predicted, axis=1)
    print('Test Loss = {} , Test Accuracy = {}, AUC = {}'.format(loss, accuracy, roc_auc_score(testY, predicted[:,1])))
    # Confusion Matrix
    predictions[:,i] = classes
    print(confusion_matrix(testY, predictions[:,i]))

Number of hidden nodes = 20
Test Loss = 0.725697981972381 , Test Accuracy = 0.805447470714043, AUC = 0.7239707845778675
[[3598  719]
 [ 181  128]]
Number of hidden nodes = 25
Test Loss = 0.6932991828642209 , Test Accuracy = 0.804366623355463, AUC = 0.7468021736897776
[[3588  729]
 [ 176  133]]
Number of hidden nodes = 30
Test Loss = 0.65151011453232 , Test Accuracy = 0.8272805878778995, AUC = 0.7313398598001578
[[3710  607]
 [ 192  117]]
Number of hidden nodes = 35
Test Loss = 0.6359370561534844 , Test Accuracy = 0.832252485948984, AUC = 0.7416400727761772
[[3715  602]
 [ 174  135]]
Number of hidden nodes = 40
Test Loss = 0.5951815889322856 , Test Accuracy = 0.829658452252315, AUC = 0.7442728491933375
[[3716  601]
 [ 187  122]]
Number of hidden nodes = 45
Test Loss = 0.5907183536978651 , Test Accuracy = 0.82641590997042, AUC = 0.7419339362031495
[[3702  615]
 [ 188  121]]
Number of hidden nodes = 50
Test Loss = 0.6083763272433481 , Test Accuracy = 0.8277129268574086, AUC = 0.7478089557

In [85]:
from collections import Counter
yPredicted = []
for i in range(predictions.shape[0]):
    counter = Counter(predictions[i,:])
    print(counter)
    if counter[0] > counter[1]:
        yPredicted.append(0)
    else:
        yPredicted.append(1)

Counter({1.0: 15})
Counter({0.0: 15})
Counter({0.0: 10, 1.0: 5})
Counter({0.0: 15})
Counter({0.0: 8, 1.0: 7})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 14, 1.0: 1})
Counter({0.0: 10, 1.0: 5})
Counter({0.0: 14, 1.0: 1})
Counter({0.0: 15})
Counter({0.0: 14, 1.0: 1})
Counter({0.0: 12, 1.0: 3})
Counter({0.0: 15})
Counter({0.0: 13, 1.0: 2})
Counter({0.0: 15})
Counter({0.0: 12, 1.0: 3})
Counter({0.0: 12, 1.0: 3})
Counter({0.0: 15})
Counter({1.0: 8, 0.0: 7})
Counter({0.0: 15})
Counter({0.0: 13, 1.0: 2})
Counter({0.0: 15})
Counter({0.0: 14, 1.0: 1})
Counter({0.0: 15})
Counter({1.0: 12, 0.0: 3})
Counter({0.0: 9, 1.0: 6})
Counter({0.0: 15})
Counter({1.0: 9, 0.0: 6})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({1.0: 10, 0.0: 5})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 14, 1.0: 1})
Counter({0.0: 14, 1.0: 1})
Counter({1.0: 11, 0.0: 4})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})
Counter(

Counter({0.0: 15})
Counter({0.0: 14, 1.0: 1})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({1.0: 8, 0.0: 7})
Counter({0.0: 15})
Counter({1.0: 12, 0.0: 3})
Counter({1.0: 14, 0.0: 1})
Counter({0.0: 14, 1.0: 1})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({1.0: 9, 0.0: 6})
Counter({0.0: 14, 1.0: 1})
Counter({0.0: 11, 1.0: 4})
Counter({0.0: 14, 1.0: 1})
Counter({0.0: 14, 1.0: 1})
Counter({0.0: 15})
Counter({0.0: 14, 1.0: 1})
Counter({0.0: 13, 1.0: 2})
Counter({0.0: 15})
Counter({0.0: 10, 1.0: 5})
Counter({0.0: 14, 1.0: 1})
Counter({0.0: 15})
Counter({1.0: 12, 0.0: 3})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 12, 1.0: 3})
Counter({0.0: 14, 1.0: 1})
Counter({0.0: 14, 1.0: 1})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({1.0: 8, 0.0: 7})
Counter({0.0: 15})
Counter({0.0: 13, 1.0: 2})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 14, 1.0: 1})
Counter({1.0: 13, 0.0: 2})
Counter({0

Counter({1.0: 10, 0.0: 5})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({1.0: 14, 0.0: 1})
Counter({0.0: 15})
Counter({0.0: 9, 1.0: 6})
Counter({0.0: 15})
Counter({0.0: 14, 1.0: 1})
Counter({0.0: 15})
Counter({0.0: 9, 1.0: 6})
Counter({0.0: 15})
Counter({0.0: 14, 1.0: 1})
Counter({0.0: 11, 1.0: 4})
Counter({0.0: 15})
Counter({1.0: 10, 0.0: 5})
Counter({0.0: 15})
Counter({0.0: 9, 1.0: 6})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 12, 1.0: 3})
Counter({0.0: 15})
Counter({1.0: 11, 0.0: 4})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 13, 1.0: 2})
Counter({0.0: 14, 1.0: 1})
Counter({0.0: 15})
Counter({0.0: 14, 1.0: 1})
Counter({0.0: 15})
Counter({1.0: 13, 0.0: 2})
Counter({0.0: 14, 1.0: 1})
Counter({1.0: 10, 0.0: 5})
Counter({0.0: 15})
Counter({0.0: 13, 1.0: 2})
Counter({0.0: 9, 1.0: 6})
Counter({0.0: 15})
Counter({0.0: 12, 1.0: 3})
Counter({0.0: 14, 1.0: 1})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})


Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 11, 1.0: 4})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 14, 1.0: 1})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 14, 1.0: 1})
Counter({0.0: 15})
Counter({0.0: 12, 1.0: 3})
Counter({0.0: 8, 1.0: 7})
Counter({0.0: 14, 1.0: 1})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({1.0: 11, 0.0: 4})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 13, 1.0: 2})
Counter({0.0: 14, 1.0: 1})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({1.0: 11, 0.0: 4})
Counter({0.0: 11, 1.0: 4})
Counter({0.0: 13, 1.0: 2})
Counter({0.0: 15})
Counter({0.0: 13, 1.0: 2})
Counter({0.0: 13, 1.0: 2})
Counter({0.0: 15})
Counter({0.0: 14, 1.0: 1})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 14, 1.0: 1})
Counter({0.0: 15})
Counter({0.0: 15})

Counter({0.0: 15})
Counter({1.0: 10, 0.0: 5})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({1.0: 8, 0.0: 7})
Counter({0.0: 15})
Counter({1.0: 12, 0.0: 3})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 8, 1.0: 7})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 12, 1.0: 3})
Counter({1.0: 9, 0.0: 6})
Counter({0.0: 14, 1.0: 1})
Counter({0.0: 15})
Counter({1.0: 10, 0.0: 5})
Counter({0.0: 15})
Counter({0.0: 14, 1.0: 1})
Counter({0.0: 15})
Counter({0.0: 12, 1.0: 3})
Counter({0.0: 12, 1.0: 3})
Counter({0.0: 12, 1.0: 3})
Counter({0.0: 15})
Counter({0.0: 9, 1.0: 6})
Counter({1.0: 12, 0.0: 3})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 10, 1.0: 5})
Counter({0.0: 15})
Counter({1.0: 13, 0.0: 2})
Counter({1.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 15})
Counter({0.0: 14, 1.0: 1})
Counter({0.0: 15})
Counter({0.0: 13, 1.0: 2})
Counter({1.0: 10, 0.0: 5})
Counter({0.0: 15})
Counter({0.0: 9,

In [84]:
confusion_matrix(testY, yPredicted)

array([[3863,  454],
       [ 189,  120]])

In [89]:
precision_score(testY, yPredicted)

0.20905923344947736

In [90]:
recall_score(testY, yPredicted)

0.3883495145631068

In [92]:
f1_score(testY, yPredicted)

0.2718006795016987

In [94]:
accuracy_score(testY, yPredicted)

0.8610030263726762

DTree

In [149]:
dtreeClf = DecisionTreeClassifier(random_state=10, criterion='gini')

In [156]:
dtreeClf.fit(x, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=10,
            splitter='best')

In [157]:
yPredictedDtree = dtreeClf.predict(testX)

In [158]:
confusion_matrix(testY, yPredictedDtree)

array([[4082,  235],
       [ 244,   65]])

In [159]:
accuracy_score(testY, yPredictedDtree)

0.8964548205793342

RandomForest

In [162]:
rfClf = RandomForestClassifier(random_state=1)
rfClf.fit(x, y)
yPredictedRf = rfClf.predict(testX)
print(confusion_matrix(testY, yPredictedRf))

[[4314    3]
 [ 301    8]]


In [163]:
accuracy_score(testY, yPredictedRf)

0.9342844790315608