## Make your own network:
Create a multi-layer perceptron neural network model to predict on a labeled dataset of your choosing. Compare this model to either a boosted tree or a random forest model and describe the relative tradeoffs between complexity and accuracy. Be sure to vary the hyperparameters of your MLP.

## Dataset: [CC Fraud](https://github.com/MitchellEwing/Supervised-Learning-Goulash/blob/master/Fraud-Detection-Credit-Card.ipynb)

In [1]:
# Import modules.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import time

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import Perceptron
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import auc, roc_curve, f1_score

# Aesthetics.
%matplotlib inline
sns.set_style('white')

# Warnings filter: "ConvergenceWarning: Stochastic Optimizer: Maximum iterations (n) reached and
# the optimization hasn't converged yet."
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings('ignore', category=ConvergenceWarning)

  from numpy.core.umath_tests import inner1d


## Model Prep

In [2]:
# Load data.
rawData = pd.read_csv('~/src/data/unit3/creditcard.csv')
rawData.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
# Scale time and amount features.
scaled_df = rawData.copy()
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_df['time_scaled'] = scaler.fit_transform(scaled_df['Time'].values.reshape(-1,1))
scaled_df['amount_scaled'] = scaler.fit_transform(scaled_df['Amount'].values.reshape(-1,1))
scaled_df.drop(['Time', 'Amount'], axis=1, inplace=True)

# Repositioning our target variable, scaled time & amount features within the dataframe.
class_var = scaled_df['Class']
time_scaled = scaled_df['time_scaled']
amount_scaled = scaled_df['amount_scaled']
scaled_df.drop(['Class', 'time_scaled', 'amount_scaled'], axis=1, inplace=True)
scaled_df.insert(0, 'class', class_var)
scaled_df.insert(1, 'time_scaled', time_scaled)
scaled_df.insert(2, 'amount_scaled', amount_scaled)
scaled_df.head()

Unnamed: 0,class,time_scaled,amount_scaled,V1,V2,V3,V4,V5,V6,V7,...,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28
0,0,-1.996583,0.244964,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,...,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053
1,0,-1.996583,-0.342475,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,...,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724
2,0,-1.996562,1.160686,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,...,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752
3,0,-1.996562,0.140534,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,...,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458
4,0,-1.996541,-0.073403,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,...,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153


In [4]:
# Rename dataframe.
df2 = scaled_df

# Review class imbalance.
print('\nWhat % of transactions are fraudulent?\n', df2['class'].value_counts(normalize=True))
print('\nHow many transactions are fraudulent?\n', df2['class'].value_counts())
print('where 0 == non-fraud transaction')
print('and 1 == fraud transaction')


What % of transactions are fraudulent?
 0    0.998273
1    0.001727
Name: class, dtype: float64

How many transactions are fraudulent?
 0    284315
1       492
Name: class, dtype: int64
where 0 == non-fraud transaction
and 1 == fraud transaction


In [5]:
# Creating dataframes for merge, separating by class == 0 & class == 1.
df3 = df2.sample(frac=1) #returns random sample of items from an axis of object.
df_fraud = df3.loc[df3['class'] == 1] #fraudulent transactions dataframe.
df_legit = df3.loc[df3['class'] == 0][:492] #includes all indexes with instances of fraud.
model_df = pd.concat([df_fraud, df_legit]) #combine instances with fraud & non-fraud.

# Not done yet, need to return a random sample.
model_df = model_df.sample(frac=1, random_state=42)

# Check for balance.
print('# of indexes within model dataframe classified as fraud & non-fraud:')
print(model_df['class'].value_counts())
print('\nwhere 0 == non-fraudulent transactions')
print('and 1 == fraudulent transactions')

# View model dataframe.
model_df.head()

# of indexes within model dataframe classified as fraud & non-fraud:
1    492
0    492
Name: class, dtype: int64

where 0 == non-fraudulent transactions
and 1 == fraudulent transactions


Unnamed: 0,class,time_scaled,amount_scaled,V1,V2,V3,V4,V5,V6,V7,...,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28
85387,0,-0.717251,-0.114384,1.073577,-0.101197,0.983983,1.383329,-0.817239,-0.276343,-0.293112,...,-0.255289,0.00706,0.128302,0.528449,-0.192418,0.449754,0.656246,-0.223649,0.054986,0.040381
276071,1,1.517626,-0.273468,2.0919,-0.757459,-1.192258,-0.755458,-0.620324,-0.322077,-1.082511,...,0.423099,0.037438,0.288253,0.831939,0.142007,0.592615,-0.196143,-0.136676,0.020182,-0.01547
222665,0,1.016237,-0.263992,2.024262,-0.074472,-1.137708,0.468677,-0.067628,-1.177963,0.295271,...,0.191597,-0.142647,-0.228556,-0.489846,0.258835,0.082729,-0.176782,0.269046,-0.071939,-0.058171
88258,1,-0.689749,-0.326762,-1.644403,3.129852,-2.576977,3.415573,-0.448525,-1.241893,-1.991652,...,-0.133819,0.290187,0.417762,-0.648576,-0.318617,-0.680413,0.389869,0.05575,0.394682,0.298821
8615,1,-1.7517,-0.349231,-3.891192,7.098916,-11.426467,8.607557,-2.065706,-2.985288,-8.138589,...,-0.501751,1.382619,1.757085,-0.189709,-0.508629,-1.189308,1.188536,0.605242,1.881529,0.87526


In [6]:
# Assign features and target.
feature_cols = model_df.iloc[:, 1:].columns
target = model_df['class']

# 75/25 train/test split.
train, test = train_test_split(model_df, test_size=0.25, random_state=42)
X_train = train[feature_cols]
Y_train = train['class']
X_test = test[feature_cols]
Y_test = test['class']

## Baseline Random Forest Classifier

In [7]:
# RFC.
def runRFC():
    start_time = time.time()
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import f1_score, roc_auc_score, auc, roc_curve
    rfc = RandomForestClassifier()
    rfc.fit(X_train, Y_train)
    y_pred = rfc.predict(X_test)
    cvTrain = cross_val_score(rfc, X_train, Y_train, cv=10)
    cvTest = cross_val_score(rfc, X_test, Y_test, cv=10)
    auroc_train = cross_val_score(rfc, X_train, Y_train, cv=10, scoring='roc_auc')
    auroc_test = cross_val_score(rfc, X_test, Y_test, cv=10, scoring='roc_auc')
    #Display.
    print('\nTrain Acc: %0.2f' % rfc.score(X_train, Y_train))
    print('\nTest Acc: %0.2f' % rfc.score(X_test, Y_test))
    print('\nCross-Val Train Acc: %0.2f (+/- %0.2f)' % (cvTrain.mean(), cvTrain.std()*2))
    print('\nCross-Val Test Acc: %0.2f (+/- %0.2f)' % (cvTest.mean(), cvTest.std()*2))
    print('\nF1 Score: %0.2f' % f1_score(Y_test, y_pred))
    print('\nCross-Val Train AUC: %0.2f (+/- %0.2f)' % (auroc_train.mean(), auroc_train.std()*2))
    print('\nCross-Val Test AUC: %0.2f (+/- %0.2f)' % (auroc_test.mean(), auroc_test.std()*2))
    print('\nAUC:', round(roc_auc_score(Y_test, y_pred), 3))
    print('\nTime taken: {} seconds'.format('%0.1f' % (time.time() - start_time)))
    
# View results.
print('Baseline:\n')
runRFC()

Baseline:


Train Acc: 1.00

Test Acc: 0.93

Cross-Val Train Acc: 0.93 (+/- 0.06)

Cross-Val Test Acc: 0.92 (+/- 0.09)

F1 Score: 0.94

Cross-Val Train AUC: 0.97 (+/- 0.03)

Cross-Val Test AUC: 0.96 (+/- 0.07)

AUC: 0.935

Time taken: 1.5 seconds


## Tuning Random Forest Classifier

In [8]:
# Hyperparameter tuning.
# Sklearn's Randomized Search on hyper-parameters.
def randomSearchCV(model, parameters):
    start_time = time.time()
    from sklearn.model_selection import RandomizedSearchCV
    rgrid = RandomizedSearchCV(model, parameters, random_state=42, scoring='roc_auc')
    rgrid.fit(X_train, Y_train)
    print('Parameters via tuning: \n', rgrid.best_params_)
    print('Time taken: {} seconds'.format('%.1f' % (time.time() - start_time)))

# Parameter options.
rfc = RandomForestClassifier()
params_rfc = {'n_estimators': [50, 100, 150],
              'max_depth': [5, 10, 15],
              'min_samples_split': [2, 4, 6],
              'min_samples_leaf': [1, 3, 5]}

# View suggested parameters.
print(randomSearchCV(rfc, params_rfc))

Parameters via tuning: 
 {'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_depth': 5}
Time taken: 6.9 seconds
None


In [15]:
# Hyperparameter tuning.
def tryRFC():
    start_time = time.time()
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import f1_score, roc_auc_score, auc, roc_curve
    rfc = RandomForestClassifier(n_estimators=50, min_samples_split=2,
                                 min_samples_leaf=5, max_depth=5)
    rfc.fit(X_train, Y_train)
    y_pred = rfc.predict(X_test)
    cvTrain = cross_val_score(rfc, X_train, Y_train, cv=10)
    cvTest = cross_val_score(rfc, X_test, Y_test, cv=10)
    auroc_train = cross_val_score(rfc, X_train, Y_train, cv=10, scoring='roc_auc')
    auroc_test = cross_val_score(rfc, X_test, Y_test, cv=10, scoring='roc_auc')
    #Display.
    print('\nTrain Acc: %0.2f' % rfc.score(X_train, Y_train))
    print('\nTest Acc: %0.2f' % rfc.score(X_test, Y_test))
    print('\nCross-Val Train Acc: %0.2f (+/- %0.2f)' % (cvTrain.mean(), cvTrain.std()*2))
    print('\nCross-Val Test Acc: %0.2f (+/- %0.2f)' % (cvTest.mean(), cvTest.std()*2))
    print('\nF1 Score: %0.2f' % f1_score(Y_test, y_pred))
    print('\nCross-Val Train AUC: %0.2f (+/- %0.2f)' % (auroc_train.mean(), auroc_train.std()*2))
    print('\nCross-Val Test AUC: %0.2f (+/- %0.2f)' % (auroc_test.mean(), auroc_test.std()*2))
    print('\nAUC:', round(roc_auc_score(Y_test, y_pred), 3))
    print('\nTime taken: {} seconds'.format('%0.1f' % (time.time() - start_time)))
    
# View results.
tryRFC()


Train Acc: 0.96

Test Acc: 0.92

Cross-Val Train Acc: 0.93 (+/- 0.05)

Cross-Val Test Acc: 0.92 (+/- 0.10)

F1 Score: 0.93

Cross-Val Train AUC: 0.97 (+/- 0.03)

Cross-Val Test AUC: 0.97 (+/- 0.06)

AUC: 0.929

Time taken: 5.2 seconds


## Baseline Multi-Level Perceptron Classifier

In [10]:
# MLP.
def runMLP():
    print('Beginning...\n')
    start_time = time.time()
    from sklearn.neural_network import MLPClassifier
    from sklearn.metrics import roc_auc_score, f1_score, auc, roc_curve
    from sklearn.model_selection import cross_val_score
    mlp = MLPClassifier(hidden_layer_sizes=(100,), alpha=0.0001)
    mlp.fit(X_train, Y_train)
    y_pred = mlp.predict(X_test)
    cvTrain = cross_val_score(mlp, X_train, Y_train, cv=10)
    cvTest = cross_val_score(mlp, X_test, Y_test, cv=10)
    auroc_train = cross_val_score(mlp, X_train, Y_train, cv=10, scoring='roc_auc')
    auroc_test = cross_val_score(mlp, X_test, Y_test, cv=10, scoring='roc_auc')
    print('\nTrain Acc: %0.2f' % mlp.score(X_train, Y_train))
    print('\nTest Acc: %0.2f' % mlp.score(X_test, Y_test))
    print('\nCross-Val Train Acc: %0.2f' % cross_val_score(mlp, X_train, Y_train, cv=10).mean())
    print('\nCross-Val Test Acc: %0.2f' % cross_val_score(mlp, X_test, Y_test, cv=10).mean())
    print('\nF1 Score: %0.2f' % f1_score(Y_test, y_pred))
    print('\nCross-Val Train AUC: %0.2f' % auroc_train.mean())
    print('\nCross-Val Test AUC: %0.2f' % auroc_test.mean())
    print('\nAUC:', round(roc_auc_score(Y_test, y_pred), 3))
    print('\n...Done\n')
    print('Time taken: {} seconds'.format('%.1f' % (time.time() - start_time)))
    
# View results.
runMLP()

Beginning...


Train Acc: 0.99

Test Acc: 0.95

Cross-Val Train Acc: 0.95

Cross-Val Test Acc: 0.93

F1 Score: 0.96

Cross-Val Train AUC: 0.98

Cross-Val Test AUC: 0.97

AUC: 0.953

...Done

Time taken: 53.7 seconds


## Tuning MLP

In [12]:
mlp = MLPClassifier()
mlp.get_params().keys()

dict_keys(['activation', 'alpha', 'batch_size', 'beta_1', 'beta_2', 'early_stopping', 'epsilon', 'hidden_layer_sizes', 'learning_rate', 'learning_rate_init', 'max_iter', 'momentum', 'nesterovs_momentum', 'power_t', 'random_state', 'shuffle', 'solver', 'tol', 'validation_fraction', 'verbose', 'warm_start'])

In [13]:
# Hyperparameter tuning.

# Parameter options.
mlp = MLPClassifier()
params_mlp = {'activation': ['identity', 'logistic', 'tanh', 'relu'],
              'solver': ['lbfgs', 'sgd', 'adam'],
              'hidden_layer_sizes': [(100,), (200,), (500,), (1000,)],
              'alpha': [0.0001, 0.001, 0.01, 1, 10, 100]}

# View suggested parameters.
print(randomSearchCV(mlp, params_mlp))

Parameters via tuning: 
 {'solver': 'sgd', 'hidden_layer_sizes': (100,), 'alpha': 0.001, 'activation': 'tanh'}
Time taken: 49.4 seconds
None


In [16]:
# MLP.
def tryMLP():
    print('Beginning...\n')
    start_time = time.time()
    from sklearn.neural_network import MLPClassifier
    from sklearn.metrics import roc_auc_score, f1_score, auc, roc_curve
    from sklearn.model_selection import cross_val_score
    mlp = MLPClassifier(hidden_layer_sizes=(100,), alpha=0.001, solver='sgd', activation='tanh')
    mlp.fit(X_train, Y_train)
    y_pred = mlp.predict(X_test)
    cvTrain = cross_val_score(mlp, X_train, Y_train, cv=10)
    cvTest = cross_val_score(mlp, X_test, Y_test, cv=10)
    auroc_train = cross_val_score(mlp, X_train, Y_train, cv=10, scoring='roc_auc')
    auroc_test = cross_val_score(mlp, X_test, Y_test, cv=10, scoring='roc_auc')
    print('\nTrain Acc: %0.2f' % mlp.score(X_train, Y_train))
    print('\nTest Acc: %0.2f' % mlp.score(X_test, Y_test))
    print('\nCross-Val Train Acc: %0.2f' % cross_val_score(mlp, X_train, Y_train, cv=10).mean())
    print('\nCross-Val Test Acc: %0.2f' % cross_val_score(mlp, X_test, Y_test, cv=10).mean())
    print('\nF1 Score: %0.2f' % f1_score(Y_test, y_pred))
    print('\nCross-Val Train AUC: %0.2f' % auroc_train.mean())
    print('\nCross-Val Test AUC: %0.2f' % auroc_test.mean())
    print('\nAUC:', round(roc_auc_score(Y_test, y_pred), 3))
    print('\n...Done\n')
    print('Time taken: {} seconds'.format('%.1f' % (time.time() - start_time)))
    
# View results.
tryMLP()

Beginning...


Train Acc: 0.95

Test Acc: 0.95

Cross-Val Train Acc: 0.94

Cross-Val Test Acc: 0.91

F1 Score: 0.95

Cross-Val Train AUC: 0.98

Cross-Val Test AUC: 0.98

AUC: 0.951

...Done

Time taken: 65.0 seconds
