# Challenge: Build your own neural network

For this challenge you have two options for how to use neural networks . Choose one of the following:

Use RBM to perform feature extraction on an image-based dataset that you find or create. If you go this route, present the features you extract and explain why this is a useful feature extraction method in the context you’re operating in. DO NOT USE either the MNIST digit recognition database or the iris data set. They’ve been worked on in very public ways very very many times and the code is easily available. (However, that code could be a useful resource to refer to). OR,

Create a multi-layer perceptron neural network model to predict on a labeled dataset of your choosing. Compare this model to either a boosted tree or a random forest model and describe the relative tradeoffs between complexity and accuracy. Be sure to vary the hyperparameters of your MLP!

## Comparing performance of of NN in detecting credit card fraud against a random forest model 

In [1]:
# The basics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
creditcard = pd.read_csv('C:\\Users\\User\\Documents\\Python_scripts\\Thinkful\\creditcard.csv')
creditcard.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


## Some adjustments to the data

In [3]:
# Showing class imbalance
creditcard.Class.value_counts()

0    284315
1       492
Name: Class, dtype: int64

In [4]:
from sklearn import preprocessing
from sklearn import model_selection

# Scale using RobustScale to account for outliers in Amount
scaler = preprocessing.RobustScaler()
norm = scaler.fit_transform(creditcard.loc[:,['Amount', 'Time']])
norm = pd.DataFrame(norm, columns=['Amount', 'Time'])

# Get data into the right shape, drop Amount, Time, Class
credit = creditcard.loc[:, ~((creditcard.columns).isin(['Amount', 'Time', 'Class']))]

# Add transformed Amount and Time for final features (X) and create outcome (Y)
X = pd.concat([credit, norm], axis=1)
y = creditcard['Class']

# Split on X and y, stratifying on y
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, stratify=y)

In [5]:
# Making the X_train set balanced w.r.t y

# First join X_train and y_train again
Xwithy = X_train.copy()
Xwithy['y']=y_train

# Then randomly select as 2 times as many features with y=0 as there are where y=1
countoffrauds = Xwithy.loc[Xwithy['y']==1, 'y'].sum()
notfrauds = Xwithy.loc[Xwithy['y']==0, :].sample(countoffrauds*2, replace=False)

# Get a dataset of only y=1
fraud = Xwithy.loc[Xwithy['y']==1,:]

# Join the two datasets
balanced = pd.concat([notfrauds, fraud])

#Split out X and y again
y_balanced = balanced['y']
X_balanced = balanced.drop('y', axis=1)

print(y_balanced.value_counts())
print(len(X_balanced))

0    738
1    369
Name: y, dtype: int64
1107


## The incumbent: RF

In [6]:
from sklearn import ensemble

# Initialising and fitting the model
rf = ensemble.RandomForestClassifier()
rf.fit(X_train, y_train)

# Testing prediction
y_hat = rf.predict(X_test)

# Assume threshold of .5
#threshold = np.where(y_hat>0.5,1,0)
cross = pd.crosstab(y_hat, y_test)
print(cross)
acc = (cross[0][0]+cross[1][1])/len(y_hat)
sensitive = cross[1][1]/(cross[1][0]+cross[1][1])
print('Accuracy is: %0.3f' % acc)
print('Sensitivity is: %0.3f' % sensitive)

Class      0   1
row_0           
0      71072  33
1          7  90
Accuracy is: 0.999
Sensitivity is: 0.732


In [27]:
from sklearn import metrics
print(metrics.classification_report(y_test, y_hat))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00     71079
          1       0.95      0.75      0.84       123

avg / total       1.00      1.00      1.00     71202



In [34]:
# Try StratiefiedKFold
count = 0
skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True,random_state=42)
for train_index, test_index in skf.split(X, y):
    Xs_train, Xs_test = X.iloc[train_index], X.iloc[test_index]
    ys_train, ys_test = y.iloc[train_index], y.iloc[test_index]
    rfc = ensemble.RandomForestClassifier()
    rfc.fit(Xs_train, ys_train)
    y_hat = rf.predict_proba(Xs_test)[:,1]
    # Threshold of 0.3
    threshold = np.where(y_hat>0.3,1,0)
    cross = pd.crosstab(threshold, ys_test)
    print(cross)
    acc = (cross[0][0]+cross[1][1])/len(y_hat)
    sensitive = cross[1][1]/(cross[1][0]+cross[1][1])
    count += 1
    print('Accuracy for fold {} is {}'.format(count, acc))
    print('Sensitivity for fold {} is {}'.format(count, sensitive))

Class      0   1
row_0           
0      56856   3
1          7  96
Accuracy for fold 1 is 0.9998244443664197
Sensitivity for fold 1 is 0.9696969696969697
Class      0   1
row_0           
0      56860   4
1          3  95
Accuracy for fold 2 is 0.9998771110564938
Sensitivity for fold 2 is 0.9595959595959596
Class      0   1
row_0           
0      56860   4
1          3  94
Accuracy for fold 3 is 0.9998771088990713
Sensitivity for fold 3 is 0.9591836734693877
Class      0   1
row_0           
0      56858   8
1          5  90
Accuracy for fold 4 is 0.9997717736697038
Sensitivity for fold 4 is 0.9183673469387755
Class      0   1
row_0           
0      56860   6
1          3  92
Accuracy for fold 5 is 0.9998419971559488
Sensitivity for fold 5 is 0.9387755102040817


## The challenger: NN

In [9]:
# Import the model.
from sklearn.neural_network import MLPClassifier

# Establish and fit the model.
# Reduce iterations to 200
mlp = MLPClassifier(hidden_layer_sizes=(28,), max_iter=200)
mlp.fit(X_train, y_train)

print("Train score: %0.3f" % mlp.score(X_train, y_train))

#Fit to test
y_hat = mlp.predict(X_test)

cross = pd.crosstab(y_hat, y_test)
print(cross)
acc = (cross[0][0]+cross[1][1])/len(y_hat)
sensitive = cross[1][1]/(cross[1][0]+cross[1][1])
print('Accuracy is: %0.3f' % acc)
print('Sensitivity is: %0.3f' % sensitive)

Train score: 0.999
Class      0   1
row_0           
0      71066  26
1         13  97
Accuracy is: 0.999
Sensitivity is: 0.789


Amazing. First attempt and the NN performs better in terms of accuracy and sensitivity!

In [10]:
from sklearn import metrics
print(metrics.classification_report(y_test, y_hat))

             precision    recall  f1-score   support

          0       1.00      1.00      1.00     71079
          1       0.88      0.79      0.83       123

avg / total       1.00      1.00      1.00     71202



In [13]:
# Establish and fit the model.
mlp = MLPClassifier(hidden_layer_sizes=(29,), max_iter=200)
mlp.fit(X_train, y_train)

print("Train score: %0.3f" % mlp.score(X_train, y_train))

#Fit to test
y_hat = mlp.predict(X_test)

cross = pd.crosstab(y_hat, y_test)
print(cross)
acc = (cross[0][0]+cross[1][1])/len(y_hat)
sensitive = cross[1][1]/(cross[1][0]+cross[1][1])
print('Accuracy is: %0.3f' % acc)
print('Sensitivity is: %0.3f' % sensitive)

print(metrics.classification_report(y_test, y_hat))

Train score: 1.000
Class      0   1
row_0           
0      71069  27
1         10  96
Accuracy is: 0.999
Sensitivity is: 0.780
             precision    recall  f1-score   support

          0       1.00      1.00      1.00     71079
          1       0.91      0.78      0.84       123

avg / total       1.00      1.00      1.00     71202



In [35]:
# Okay, but how does it do in CV

# Try StratiefiedKFold
count = 0
skf = model_selection.StratifiedKFold(n_splits=5, shuffle=True,random_state=42)
for train_index, test_index in skf.split(X, y):
    Xs_train, Xs_test = X.iloc[train_index], X.iloc[test_index]
    ys_train, ys_test = y.iloc[train_index], y.iloc[test_index]
    mlp = MLPClassifier(hidden_layer_sizes=(29,), max_iter=200)
    mlp.fit(Xs_train, ys_train)
    y_hat = mlp.predict(Xs_test)
    cross = pd.crosstab(y_hat, ys_test)
    print(cross)
    acc = (cross[0][0]+cross[1][1])/len(y_hat)
    sensitive = cross[1][1]/(cross[1][0]+cross[1][1])
    count += 1
    print('Accuracy for fold {} is {}'.format(count, acc))
    print('Sensitivity for fold {} is {}'.format(count, sensitive))

Class      0   1
row_0           
0      56855  20
1          8  79
Accuracy for fold 1 is 0.9995084442259752
Sensitivity for fold 1 is 0.797979797979798
Class      0   1
row_0           
0      56853  21
1         10  78
Accuracy for fold 2 is 0.9994557775359011
Sensitivity for fold 2 is 0.7878787878787878
Class      0   1
row_0           
0      56857  24
1          6  74
Accuracy for fold 3 is 0.9994733238531627
Sensitivity for fold 3 is 0.7551020408163265
Class      0   1
row_0           
0      56856  26
1          7  72
Accuracy for fold 4 is 0.999420656238479
Sensitivity for fold 4 is 0.7346938775510204
Class      0   1
row_0           
0      56851  17
1         12  81
Accuracy for fold 5 is 0.999490879724724
Sensitivity for fold 5 is 0.826530612244898


So the NN does better, but in CV the sensitivity performs worse (but is still consistent so no overfitting)

In [12]:
# Adding a second layers

mlp = MLPClassifier(hidden_layer_sizes=(29,14), max_iter=200)
mlp.fit(X_train, y_train)

print("Train score: %0.3f" % mlp.score(X_train, y_train))

#Fit to test
y_hat = mlp.predict(X_test)

cross = pd.crosstab(y_hat, y_test)
print(cross)
acc = (cross[0][0]+cross[1][1])/len(y_hat)
sensitive = cross[1][1]/(cross[1][0]+cross[1][1])
print('Accuracy is: %0.3f' % acc)
print('Sensitivity is: %0.3f' % sensitive)

print(metrics.classification_report(y_test, y_hat))

Train score: 1.000
Class      0   1
row_0           
0      71068  25
1         11  98
Accuracy is: 0.999
Sensitivity is: 0.797
             precision    recall  f1-score   support

          0       1.00      1.00      1.00     71079
          1       0.90      0.80      0.84       123

avg / total       1.00      1.00      1.00     71202



In [14]:
# Adding a third layer

mlp = MLPClassifier(hidden_layer_sizes=(29,14,7), max_iter=200)
mlp.fit(X_train, y_train)

print("Train score: %0.3f" % mlp.score(X_train, y_train))

#Fit to test
y_hat = mlp.predict(X_test)

cross = pd.crosstab(y_hat, y_test)
print(cross)
acc = (cross[0][0]+cross[1][1])/len(y_hat)
sensitive = cross[1][1]/(cross[1][0]+cross[1][1])
print('Accuracy is: %0.3f' % acc)
print('Sensitivity is: %0.3f' % sensitive)

print(metrics.classification_report(y_test, y_hat))

Train score: 1.000
Class      0   1
row_0           
0      71069  28
1         10  95
Accuracy is: 0.999
Sensitivity is: 0.772
             precision    recall  f1-score   support

          0       1.00      1.00      1.00     71079
          1       0.90      0.77      0.83       123

avg / total       1.00      1.00      1.00     71202



In [15]:
# Establish and fit the model.
# Reduce iterations to 200
mlp = MLPClassifier(hidden_layer_sizes=(28,), max_iter=200, activation='logistic')
mlp.fit(X_train, y_train)

print("Train score: %0.3f" % mlp.score(X_train, y_train))

#Fit to test
y_hat = mlp.predict(X_test)

cross = pd.crosstab(y_hat, y_test)
print(cross)
acc = (cross[0][0]+cross[1][1])/len(y_hat)
sensitive = cross[1][1]/(cross[1][0]+cross[1][1])
print('Accuracy is: %0.3f' % acc)
print('Sensitivity is: %0.3f' % sensitive)

Train score: 0.999
Class      0   1
row_0           
0      71064  28
1         15  95
Accuracy is: 0.999
Sensitivity is: 0.772


In [16]:
params = {'hidden_layer_sizes':[(29,), (50,)],
          'max_iter': [200, 400],
          #'activation': ['logistic', 'relu'],
          #'solver' : ['sgd', 'adam']
         }

# Initialize the model
mlp = MLPClassifier()

# Apply GridSearch to the model
grid = model_selection.GridSearchCV(mlp, params)
grid.fit(X_train, y_train)

# Save model for use in CV
sv_best = grid.best_estimator_
print(grid.best_estimator_)

print("Accuracy is: %0.3f" % grid.score(X_test, y_test))


MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(50,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=400, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)
Accuracy is: 1.000


NameError: name 'dataset_test' is not defined

In [17]:

y_hat = grid.predict(X_test)
print(metrics.classification_report(y_test, y_hat))
cross = pd.crosstab(y_hat, y_test)
print(cross)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00     71079
          1       0.92      0.78      0.85       123

avg / total       1.00      1.00      1.00     71202

Class      0   1
row_0           
0      71071  27
1          8  96
