In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import keras_tuner as kt
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score


In [2]:
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection. No parameters necessary if TPU_NAME environment variable is set. On Kaggle this is always the case.
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy() # default distribution strategy in Tensorflow. Works on CPU and single GPU.

print("REPLICAS: ", strategy.num_replicas_in_sync)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

REPLICAS:  1
Num GPUs Available:  1


In [3]:
df = pd.read_csv('final_project.csv')

In [13]:
print(df)

              x0        x1         x2         x3        x4         x5  \
0      -0.166563 -3.961588   4.621113   2.481908 -1.800135   0.804684   
1      -0.149894 -0.585676  27.839856   4.152333  6.426802  -2.426943   
2      -0.321707 -1.429819  12.251561   6.586874 -5.304647 -11.311090   
3      -0.245594  5.076677 -24.149632   3.637307  6.505811   2.290224   
4      -0.273366  0.306326 -11.352593   1.676758  2.928441  -0.616824   
...          ...       ...        ...        ...       ...        ...   
159995 -0.487024 -4.270269   0.417395  -1.992423  1.757552  -1.167819   
159996  0.825477  4.804368  22.161535  11.371303  1.715901   6.990759   
159997 -0.802489  5.362696   7.243419  -7.496074  2.295250  -2.756067   
159998  0.339237  7.609895   5.368414  -2.825481  4.046102  15.322603   
159999 -0.296748 -0.412773 -10.911407  -5.633629 -4.028154  15.939428   

               x6         x7         x8        x9  ...        x41       x42  \
0        6.718751 -14.789997  -1.040673 -4.2

In [187]:
df.dtypes

x0     float64
x1     float64
x2     float64
x3     float64
x4     float64
x5     float64
x6     float64
x7     float64
x8     float64
x9     float64
x10    float64
x11    float64
x12    float64
x13    float64
x14    float64
x15    float64
x16    float64
x17    float64
x18    float64
x19    float64
x20    float64
x21    float64
x22    float64
x23    float64
x24     object
x25    float64
x26    float64
x27    float64
x28    float64
x29     object
x30     object
x31    float64
x32     object
x33    float64
x34    float64
x35    float64
x36    float64
x37     object
x38    float64
x39    float64
x40    float64
x41    float64
x42    float64
x43    float64
x44    float64
x45    float64
x46    float64
x47    float64
x48    float64
x49    float64
y        int64
dtype: object

Check data quality

In [188]:
count_na = df.isna().sum()
print(count_na)
class_counts = df['y'].value_counts()
print(class_counts)

x0     26
x1     25
x2     38
x3     37
x4     26
x5     37
x6     26
x7     27
x8     21
x9     30
x10    43
x11    30
x12    36
x13    31
x14    34
x15    35
x16    26
x17    27
x18    40
x19    35
x20    38
x21    29
x22    27
x23    47
x24    28
x25    22
x26    36
x27    30
x28    35
x29    30
x30    30
x31    39
x32    31
x33    41
x34    41
x35    30
x36    27
x37    23
x38    31
x39    23
x40    36
x41    40
x42    26
x43    37
x44    40
x45    29
x46    31
x47    37
x48    32
x49    32
y       0
dtype: int64
0    95803
1    64197
Name: y, dtype: int64


Handling the % in 'x32' by stripping the % and converting to float, then divide by 100

In [189]:
df['x32'] = df['x32'].str.replace('%', '')
df['x32'] = pd.to_numeric(df['x32']) / 100
df['x32']

0         0.0000
1        -0.0002
2        -0.0001
3         0.0001
4         0.0001
           ...  
159995    0.0000
159996   -0.0001
159997   -0.0000
159998   -0.0002
159999    0.0002
Name: x32, Length: 160000, dtype: float64

Handling column 'x37' by stripping the $ and converting it to a float64

In [190]:
df['x37'] = df['x37'].str.replace('$', '')
df['x37'] = pd.to_numeric(df['x37'])
df['x37']

  df['x37'] = df['x37'].str.replace('$', '')


0         1313.96
1         1962.78
2          430.47
3        -2366.29
4         -620.66
           ...   
159995    -891.96
159996    1588.65
159997     687.46
159998     439.21
159999   -1229.34
Name: x37, Length: 160000, dtype: float64

Imputing all of the missing data with either mean imputation for numerical or most frequent for category imputation.

In [191]:
num_cols = [f"x{i}" for i in range(50) if i not in [24, 29, 30]]
cat_cols = ["x24", "x29", "x30"]
#copy the target before doing the transform since it gets dropped
y = df['y'].values

num_imputer = SimpleImputer(strategy='mean')  
cat_imputer = SimpleImputer(strategy='most_frequent')

transformer = ColumnTransformer(
    transformers=[
        ('num_imputer', num_imputer, num_cols),
        ('cat_imputer', cat_imputer, cat_cols)
    ])

df_imputed = pd.DataFrame(transformer.fit_transform(df), columns=num_cols+cat_cols)
df_imputed.index = df.index

In [192]:
count_na = df_imputed.isna().sum()
print(count_na)

x0     0
x1     0
x2     0
x3     0
x4     0
x5     0
x6     0
x7     0
x8     0
x9     0
x10    0
x11    0
x12    0
x13    0
x14    0
x15    0
x16    0
x17    0
x18    0
x19    0
x20    0
x21    0
x22    0
x23    0
x25    0
x26    0
x27    0
x28    0
x31    0
x32    0
x33    0
x34    0
x35    0
x36    0
x37    0
x38    0
x39    0
x40    0
x41    0
x42    0
x43    0
x44    0
x45    0
x46    0
x47    0
x48    0
x49    0
x24    0
x29    0
x30    0
dtype: int64


One hot encoding the 'x24' that appears to be a continent, the x29 that is a month, and the x32 that is a weekday.

In [193]:
df_imputed.describe()

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x43,x44,x45,x46,x47,x48,x49,x24,x29,x30
count,160000.0,160000.0,160000.0,160000.0,160000.0,160000.0,160000.0,160000.0,160000.0,160000.0,...,160000.0,160000.0,160000.0,160000.0,160000.0,160000.0,160000.0,160000,160000,160000
unique,159975.0,159976.0,159963.0,159964.0,159975.0,159964.0,159975.0,159974.0,159980.0,159971.0,...,159964.0,159961.0,159972.0,159970.0,159964.0,159969.0,159969.0,3,12,5
top,-0.001028,0.001358,-1.150145,-0.024637,-0.000549,0.013582,-1.67067,-7.692795,-0.03054,0.005462,...,-0.002091,-0.00625,0.000885,-12.755395,0.028622,-0.000224,-0.674224,asia,July,wednesday
freq,26.0,25.0,38.0,37.0,26.0,37.0,26.0,27.0,21.0,30.0,...,37.0,40.0,29.0,31.0,37.0,32.0,32.0,138993,45599,101565


In [194]:
df = pd.get_dummies(df_imputed, columns=['x24', 'x29', 'x30'])

Prep the data for a cross val predict like prediction loop

In [209]:
df.dtypes

x0               object
x1               object
x2               object
x3               object
x4               object
                  ...  
x30_friday        uint8
x30_monday        uint8
x30_thurday       uint8
x30_tuesday       uint8
x30_wednesday     uint8
Length: 67, dtype: object

In [210]:
#split the label values into y and the features into X

X = df.values

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [211]:
# Setup some data in a train and val split to perform the search of the best model
#will circle back to using kfolds later

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=1234)

In [217]:
def build_model(hp):
    model = keras.Sequential()
    model.add(layers.Flatten())
    # Tune the number of layers.
    for i in range(hp.Int("num_layers", 1, 5)):
        model.add(
            layers.Dense(
                # Tune number of units separately.
                units=hp.Int(f"units_{i}", min_value=32, max_value=512, step=32),
                activation='relu'),
            )
        
    if hp.Boolean("dropout"):
        model.add(layers.Dropout(rate=0.25))
    model.add(layers.Dense(1, activation="sigmoid"))
    learning_rate = hp.Float("lr", min_value=1e-4, max_value=1e-2, sampling="log")
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model


build_model(kt.HyperParameters())

<keras.engine.sequential.Sequential at 0x2684616ae20>

In [218]:
#build a keras_tuner RandomSearch to tune the training process and optimize the Neural Network
# https://www.tensorflow.org/tutorials/keras/keras_tuner
tuner = kt.Hyperband(
    build_model,
    objective='val_accuracy',
    max_epochs=100,
    factor=3,
    hyperband_iterations=5,
    seed=1234,
    directory='hp_tuning',
    project_name='CaseStudy7_Run_1',
    overwrite=False,
    seed=1234
)
es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

In [235]:
tuner.search_space_summary()

Search space summary
Default search space size: 8
num_layers (Int)
{'default': None, 'conditions': [], 'min_value': 1, 'max_value': 5, 'step': 1, 'sampling': 'linear'}
units_0 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 512, 'step': 32, 'sampling': 'linear'}
dropout (Boolean)
{'default': False, 'conditions': []}
lr (Float)
{'default': 0.0001, 'conditions': [], 'min_value': 0.0001, 'max_value': 0.01, 'step': None, 'sampling': 'log'}
units_1 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 512, 'step': 32, 'sampling': 'linear'}
units_2 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 512, 'step': 32, 'sampling': 'linear'}
units_3 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 512, 'step': 32, 'sampling': 'linear'}
units_4 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 512, 'step': 32, 'sampling': 'linear'}


In [219]:
#train the DNN with a hyper parameter search
tuner.search(X_train, y_train,
             epochs=50,
             validation_data=(X_val, y_val),
             callbacks=[es_callback])

#get the best hyperparameters and store them in a var
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

Trial 1270 Complete [00h 01m 24s]
val_accuracy: 0.9593750238418579

Best val_accuracy So Far: 0.9731249809265137
Total elapsed time: 12h 52m 08s
INFO:tensorflow:Oracle triggered exit


#### Preparing a  K Fold split
The best hyperparameters from the Hyperband tuning algorithm will be used to predict all 160,000 predictions, with models training on 143,999 datapoints using a 10 k fold split to where a model will train on that fold's training data, and then predictions made on the test set.  All test sets predictions will be concatenated into a flat array of predictions and scored for accuracy against the true values.

In [233]:
best_model = tuner.hypermodel.build(best_hps)
best_model.build(input_shape=(None, 28,28))
best_model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_3 (Flatten)         (None, 784)               0         
                                                                 
 dense_9 (Dense)             (None, 32)                25120     
                                                                 
 dense_10 (Dense)            (None, 448)               14784     
                                                                 
 dropout_3 (Dropout)         (None, 448)               0         
                                                                 
 dense_11 (Dense)            (None, 1)                 449       
                                                                 
Total params: 40,353
Trainable params: 40,353
Non-trainable params: 0
_________________________________________________________________


In [240]:
kf = KFold(n_splits=10)
best_model = tuner.hypermodel.build(best_hps)
indices = kf.split(X,y)
for train_index, test_index in indices:
    print(f"train: {train_index}")
    print(f"test: {test_index}")

train: [ 16000  16001  16002 ... 159997 159998 159999]
test: [    0     1     2 ... 15997 15998 15999]
train: [     0      1      2 ... 159997 159998 159999]
test: [16000 16001 16002 ... 31997 31998 31999]
train: [     0      1      2 ... 159997 159998 159999]
test: [32000 32001 32002 ... 47997 47998 47999]
train: [     0      1      2 ... 159997 159998 159999]
test: [48000 48001 48002 ... 63997 63998 63999]
train: [     0      1      2 ... 159997 159998 159999]
test: [64000 64001 64002 ... 79997 79998 79999]
train: [     0      1      2 ... 159997 159998 159999]
test: [80000 80001 80002 ... 95997 95998 95999]
train: [     0      1      2 ... 159997 159998 159999]
test: [ 96000  96001  96002 ... 111997 111998 111999]
train: [     0      1      2 ... 159997 159998 159999]
test: [112000 112001 112002 ... 127997 127998 127999]
train: [     0      1      2 ... 159997 159998 159999]
test: [128000 128001 128002 ... 143997 143998 143999]
train: [     0      1      2 ... 143997 143998 143999]


In [250]:

predictions = []
i = 1
for train_index, test_index in kf.split(X):
    best_model = tuner.hypermodel.build(best_hps) #have to create a new instance of the model to ensure  it doesn't reload anything that it was previously trained on
    X_train_val, X_test = X[train_index], X[test_index]
    y_train_val, y_test = y[train_index], y[test_index]

    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=.15, random_state=1234)
    best_model.fit(X_train, y_train, epochs=100, validation_data=(X_val, y_val), callbacks=[es_callback])
    y_prob = best_model.predict(X_test)
    y_pred = (y_prob > 0.5).astype("int32")
    predictions.append(y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    print(f"Iteration {i}")
    print(f"Accuracy {accuracy}")
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1 Score: ", f1)
    print("Confusion Matrix: \n", confusion)
    i = i+1
    

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Iteration 1
Precision:  0.9614547715262667
Recall:  0.9566965666563563
F1 Score:  0.9590697674418605
Confusion Matrix: 
 [[9286  248]
 [ 280 6186]]
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Iteration 2
Precision:  0.9673119644557283
Recall:  0.9525
F1 Score:  0.95984

##### Final Scoring of the Neural Network Approach

In [267]:
full_predictions = np.concatenate(predictions)
accuracy = accuracy_score(y, full_predictions)
precision = precision_score(y, full_predictions)
recall = recall_score(y, full_predictions)
f1 = f1_score(y, full_predictions)
confusion = confusion_matrix(y, full_predictions)
print(f"Accuracy {accuracy}")
print("Precision: ", precision)
print("Recall: ", recall)
print("F1 Score: ", f1)
print("Confusion Matrix: \n", confusion)
cost = (confusion[0,1] * 100) + (confusion[1,0]*150) 
print(f"Final Cost: $ {'{:,.2f}'.format(cost)}")

Accuracy 0.96711875
Precision:  0.9639753117521098
Recall:  0.9536894247394738
F1 Score:  0.9588047827482793
Confusion Matrix: 
 [[93515  2288]
 [ 2973 61224]]
Final Cost: $ 674,750.00


In [271]:
print(best_hps.values)

{'num_layers': 2, 'units_0': 32, 'dropout': True, 'lr': 0.0009259685801345267, 'units_1': 448, 'units_2': 224, 'units_3': 288, 'units_4': 288, 'tuner/epochs': 100, 'tuner/initial_epoch': 34, 'tuner/bracket': 2, 'tuner/round': 2, 'tuner/trial_id': '0737'}
