In [43]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
import keras_tuner as kt
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, accuracy_score


In [44]:
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection. No parameters necessary if TPU_NAME environment variable is set. On Kaggle this is always the case.
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy() # default distribution strategy in Tensorflow. Works on CPU and single GPU.

print("REPLICAS: ", strategy.num_replicas_in_sync)
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

REPLICAS:  1
Num GPUs Available:  1


In [45]:
df = pd.read_csv('final_project.csv')

In [46]:
df.dtypes

x0     float64
x1     float64
x2     float64
x3     float64
x4     float64
x5     float64
x6     float64
x7     float64
x8     float64
x9     float64
x10    float64
x11    float64
x12    float64
x13    float64
x14    float64
x15    float64
x16    float64
x17    float64
x18    float64
x19    float64
x20    float64
x21    float64
x22    float64
x23    float64
x24     object
x25    float64
x26    float64
x27    float64
x28    float64
x29     object
x30     object
x31    float64
x32     object
x33    float64
x34    float64
x35    float64
x36    float64
x37     object
x38    float64
x39    float64
x40    float64
x41    float64
x42    float64
x43    float64
x44    float64
x45    float64
x46    float64
x47    float64
x48    float64
x49    float64
y        int64
dtype: object

Check data quality

In [47]:
count_na = df.isna().sum()
print(count_na)
class_counts = df['y'].value_counts()
print(class_counts)

x0     26
x1     25
x2     38
x3     37
x4     26
x5     37
x6     26
x7     27
x8     21
x9     30
x10    43
x11    30
x12    36
x13    31
x14    34
x15    35
x16    26
x17    27
x18    40
x19    35
x20    38
x21    29
x22    27
x23    47
x24    28
x25    22
x26    36
x27    30
x28    35
x29    30
x30    30
x31    39
x32    31
x33    41
x34    41
x35    30
x36    27
x37    23
x38    31
x39    23
x40    36
x41    40
x42    26
x43    37
x44    40
x45    29
x46    31
x47    37
x48    32
x49    32
y       0
dtype: int64
0    95803
1    64197
Name: y, dtype: int64


In [91]:
color_list = ['#92B2F7', '#FA7D7A']
fig = go.Figure()
#create a horizontal bar for each class in the particle data.
for idx, (label, count) in enumerate(class_counts.items()):
    fig.add_trace(
        go.Bar(
            y=[str(label)],
            x=[count],
            name=str(label),
            orientation='h',
            text=[count],
            textposition='auto',
            marker_color=color_list[idx % len(color_list)]
        )
    )

fig.update_layout(
    title_text='Frequency of Negative and Positive Class',
    title_x=0.5, 
    xaxis_title='Frequency',
    yaxis_title='Label',
    showlegend = False,
    autosize=False,
    width=500,
    height=500,
    margin=dict(l=50, r=50, b=100, t=100, pad=4),
    plot_bgcolor='white', 
    xaxis=dict(gridcolor='lightgrey')
)

fig.show()

Handling the % in 'x32' by stripping the % and converting to float, then divide by 100

In [48]:
df['x32'] = df['x32'].str.replace('%', '')
df['x32'] = pd.to_numeric(df['x32']) / 100
df['x32']

0         0.0000
1        -0.0002
2        -0.0001
3         0.0001
4         0.0001
           ...  
159995    0.0000
159996   -0.0001
159997   -0.0000
159998   -0.0002
159999    0.0002
Name: x32, Length: 160000, dtype: float64

Handling column 'x37' by stripping the $ and converting it to a float64

In [49]:
df['x37'] = df['x37'].str.replace('$', '')
df['x37'] = pd.to_numeric(df['x37'])
df['x37']

  df['x37'] = df['x37'].str.replace('$', '')


0         1313.96
1         1962.78
2          430.47
3        -2366.29
4         -620.66
           ...   
159995    -891.96
159996    1588.65
159997     687.46
159998     439.21
159999   -1229.34
Name: x37, Length: 160000, dtype: float64

Imputing all of the missing data with either mean imputation for numerical or most frequent for category imputation.

In [50]:
num_cols = [f"x{i}" for i in range(50) if i not in [24, 29, 30]]
cat_cols = ["x24", "x29", "x30"]
#copy the target before doing the transform since it gets dropped
y = df['y'].values

num_imputer = SimpleImputer(strategy='mean')  
cat_imputer = SimpleImputer(strategy='most_frequent')

transformer = ColumnTransformer(
    transformers=[
        ('num_imputer', num_imputer, num_cols),
        ('cat_imputer', cat_imputer, cat_cols)
    ])

df_imputed = pd.DataFrame(transformer.fit_transform(df), columns=num_cols+cat_cols)
df_imputed.index = df.index

In [51]:
count_na = df_imputed.isna().sum()
print(count_na)

x0     0
x1     0
x2     0
x3     0
x4     0
x5     0
x6     0
x7     0
x8     0
x9     0
x10    0
x11    0
x12    0
x13    0
x14    0
x15    0
x16    0
x17    0
x18    0
x19    0
x20    0
x21    0
x22    0
x23    0
x25    0
x26    0
x27    0
x28    0
x31    0
x32    0
x33    0
x34    0
x35    0
x36    0
x37    0
x38    0
x39    0
x40    0
x41    0
x42    0
x43    0
x44    0
x45    0
x46    0
x47    0
x48    0
x49    0
x24    0
x29    0
x30    0
dtype: int64


One hot encoding the 'x24' that appears to be a continent, the x29 that is a month, and the x32 that is a weekday.

In [52]:
df_imputed.describe()

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x43,x44,x45,x46,x47,x48,x49,x24,x29,x30
count,160000.0,160000.0,160000.0,160000.0,160000.0,160000.0,160000.0,160000.0,160000.0,160000.0,...,160000.0,160000.0,160000.0,160000.0,160000.0,160000.0,160000.0,160000,160000,160000
unique,159975.0,159976.0,159963.0,159964.0,159975.0,159964.0,159975.0,159974.0,159980.0,159971.0,...,159964.0,159961.0,159972.0,159970.0,159964.0,159969.0,159969.0,3,12,5
top,-0.001028,0.001358,-1.150145,-0.024637,-0.000549,0.013582,-1.67067,-7.692795,-0.03054,0.005462,...,-0.002091,-0.00625,0.000885,-12.755395,0.028622,-0.000224,-0.674224,asia,July,wednesday
freq,26.0,25.0,38.0,37.0,26.0,37.0,26.0,27.0,21.0,30.0,...,37.0,40.0,29.0,31.0,37.0,32.0,32.0,138993,45599,101565


In [53]:
df = pd.get_dummies(df_imputed, columns=['x24', 'x29', 'x30'])

Prep the data for a cross val predict like prediction loop

In [54]:
df.dtypes

x0               object
x1               object
x2               object
x3               object
x4               object
                  ...  
x30_friday        uint8
x30_monday        uint8
x30_thurday       uint8
x30_tuesday       uint8
x30_wednesday     uint8
Length: 67, dtype: object

In [55]:
#split the label values into y and the features into X

X = df.values

scaler = StandardScaler()
X = scaler.fit_transform(X)

In [56]:
# Setup some data in a train and val split to perform the search of the best model
#will circle back to using kfolds later

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=1234)

In [64]:

def build_model(hp):
    model = keras.Sequential()
    model.add(layers.Flatten())
    # Tune the number of layers.
    for i in range(hp.Int("num_layers", 2, 6)):
        model.add(
            layers.Dense(
                # Tune number of units separately.
                units=hp.Int(f"units_{i}", min_value=32, max_value=768, step=32),
                activation=hp.Choice(f'activation_{i}', values=['relu', 'tanh'],),
                kernel_regularizer=keras.regularizers.l2(hp.Float(f'weight_decay_{i}', min_value=0, max_value=1e-2, sampling='linear')))
            )
        
    if hp.Boolean("dropout"):
        model.add(layers.Dropout(hp.Float(f'dropout_{i}', min_value=0.005, max_value=0.055, step=0.01)))
    model.add(layers.Dense(1, activation="sigmoid"))
    learning_rate = hp.Choice("learning_rate", values=[1e-2, 1e-3, 1e-4])
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return model

build_model(kt.HyperParameters())

<keras.engine.sequential.Sequential at 0x1c722089070>

In [65]:
#build a keras_tuner RandomSearch to tune the training process and optimize the Neural Network
# https://www.tensorflow.org/tutorials/keras/keras_tuner
tuner = kt.Hyperband(
    build_model,
    objective='val_accuracy',
    max_epochs=50,
    factor=3,
    hyperband_iterations=5,
    seed=1234,
    directory='hp_tuning_5',
    project_name='CaseStudy7_Run_5',
    overwrite=True,
    seed=1234
)
es_callback = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3)

In [66]:
tuner.search_space_summary()

Search space summary
Default search space size: 9
num_layers (Int)
{'default': None, 'conditions': [], 'min_value': 2, 'max_value': 6, 'step': 1, 'sampling': 'linear'}
units_0 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 768, 'step': 32, 'sampling': 'linear'}
activation_0 (Choice)
{'default': 'relu', 'conditions': [], 'values': ['relu', 'tanh'], 'ordered': False}
weight_decay_0 (Float)
{'default': 0.0, 'conditions': [], 'min_value': 0.0, 'max_value': 0.01, 'step': None, 'sampling': 'linear'}
units_1 (Int)
{'default': None, 'conditions': [], 'min_value': 32, 'max_value': 768, 'step': 32, 'sampling': 'linear'}
activation_1 (Choice)
{'default': 'relu', 'conditions': [], 'values': ['relu', 'tanh'], 'ordered': False}
weight_decay_1 (Float)
{'default': 0.0, 'conditions': [], 'min_value': 0.0, 'max_value': 0.01, 'step': None, 'sampling': 'linear'}
dropout (Boolean)
{'default': False, 'conditions': []}
learning_rate (Choice)
{'default': 0.01, 'conditions': [], 'value

In [67]:
#train the DNN with a hyper parameter search
tuner.search(X_train, y_train,
             epochs=50,
             validation_data=(X_val, y_val),
             callbacks=[es_callback])

#get the best hyperparameters and store them in a var
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

Trial 450 Complete [00h 03m 39s]
val_accuracy: 0.9669166803359985

Best val_accuracy So Far: 0.9729999899864197
Total elapsed time: 06h 35m 19s
INFO:tensorflow:Oracle triggered exit


#### Preparing a  K Fold split
The best hyperparameters from the Hyperband tuning algorithm will be used to predict all 160,000 predictions, with models training on 143,999 datapoints using a 10 k fold split to where a model will train on that fold's training data, and then predictions made on the test set.  All test sets predictions will be concatenated into a flat array of predictions and scored for accuracy against the true values.

In [68]:
best_model = tuner.hypermodel.build(best_hps)
best_model.build(input_shape=(None, 28,28))
best_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_1 (Flatten)         (None, 784)               0         
                                                                 
 dense_6 (Dense)             (None, 672)               527520    
                                                                 
 dense_7 (Dense)             (None, 128)               86144     
                                                                 
 dense_8 (Dense)             (None, 384)               49536     
                                                                 
 dense_9 (Dense)             (None, 32)                12320     
                                                                 
 dense_10 (Dense)            (None, 128)               4224      
                                                                 
 dropout (Dropout)           (None, 128)              

In [84]:
kf = KFold(n_splits=3)
best_model = tuner.hypermodel.build(best_hps)
indices = kf.split(X,y)
for train_index, test_index in indices:
    print(f"train: {train_index}")
    print(f"test: {test_index}")

train: [ 53334  53335  53336 ... 159997 159998 159999]
test: [    0     1     2 ... 53331 53332 53333]
train: [     0      1      2 ... 159997 159998 159999]
test: [ 53334  53335  53336 ... 106664 106665 106666]
train: [     0      1      2 ... 106664 106665 106666]
test: [106667 106668 106669 ... 159997 159998 159999]


In [85]:
es_callback_final = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5)

In [86]:
probabilities = []
predictions = []
i = 1
for train_index, test_index in kf.split(X):
    best_model = tuner.hypermodel.build(best_hps) #have to create a new instance of the model to ensure  it doesn't reload anything that it was previously trained on
    X_train_val, X_test = X[train_index], X[test_index]
    y_train_val, y_test = y[train_index], y[test_index]

    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=.15, random_state=1234)
    best_model.fit(X_train, y_train, epochs=100, validation_data=(X_val, y_val), callbacks=[es_callback_final])
    y_prob = best_model.predict(X_test)
    y_pred = (y_prob > 0.5).astype("int32")
    probabilities.append(y_prob)
    predictions.append(y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    confusion = confusion_matrix(y_test, y_pred)
    print(f"Iteration {i}")
    print(f"Accuracy {accuracy}")
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F1 Score: ", f1)
    print("Confusion Matrix: \n", confusion)
    i = i+1
    

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Iteration 1
Accuracy 0.9705628679641505
Precision:  0.9614548146777104
Recall:  0.965744829188938
F1 Score:  0.9635950470713723
Confusion Matrix: 
 [[30986   833]
 [  737 20778]]
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Iteration 2
Accuracy 0.9695685598034988
Precision:  0.9624900239425379
R

##### Final Scoring of the Neural Network Approach

In [95]:
full_predictions = np.concatenate(predictions)
accuracy = accuracy_score(y, full_predictions)
precision = precision_score(y, full_predictions)
recall = recall_score(y, full_predictions)
f1 = f1_score(y, full_predictions)
confusion = confusion_matrix(y, full_predictions)
print(f"Accuracy {accuracy}")
print("Precision: ", precision)
print("Recall: ", recall)
print("F1 Score: ", f1)
print("Confusion Matrix: \n", confusion)
cost = (confusion[0,1] * 150) + (confusion[1,0]*100) 
print(f"Final Cost: $ {'{:,.2f}'.format(cost)}")

Accuracy 0.97004375
Precision:  0.9610966219572777
Recall:  0.9643752823340654
F1 Score:  0.9627331607224776
Confusion Matrix: 
 [[93297  2506]
 [ 2287 61910]]
Final Cost: $ 604,600.00


Adjusting for the higher cost of a FN

In [97]:
costs = []
for p in np.arange(.70, .2, -.01):  
    full_probs = np.concatenate(probabilities)
    full_predictions = (full_probs > p).astype("int32")
    accuracy = accuracy_score(y, full_predictions)
    precision = precision_score(y, full_predictions)
    recall = recall_score(y, full_predictions)
    f1 = f1_score(y, full_predictions)
    confusion = confusion_matrix(y, full_predictions)
    print(f"Accuracy {accuracy}")
    #print("Precision: ", precision)
    print("Recall: ", recall)
    #print("F1 Score: ", f1)
    print("Confusion Matrix: \n", confusion)
    cost = (confusion[0,1] * 150) + (confusion[1,0]*100)
    metrics = {
        'p': p,
        'cost': cost,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion': confusion
    }
    costs.append(metrics)  # append the metrics dictionary
    print(f"Final Cost at {p}: $ {'{:,.2f}'.format(cost)}")

# Sort costs list by 'cost' key and get the dictionary with the minimum cost.
min_cost = min(costs, key=lambda x: x['cost'])

print(f"The minimum cost is {min_cost['cost']} at threshold {min_cost['p']}.")
print(f"Corresponding metrics: Accuracy: {min_cost['accuracy']}, Precision: {min_cost['precision']}, Recall: {min_cost['recall']}, F1 Score: {min_cost['f1']}")

Accuracy 0.9676125
Recall:  0.9406670093618081
Confusion Matrix: 
 [[94430  1373]
 [ 3809 60388]]
Final Cost at 0.7: $ 586,850.00
Accuracy 0.96799375
Recall:  0.9425518326401545
Confusion Matrix: 
 [[94370  1433]
 [ 3688 60509]]
Final Cost at 0.69: $ 583,750.00
Accuracy 0.96839375
Recall:  0.9442653083477421
Confusion Matrix: 
 [[94324  1479]
 [ 3578 60619]]
Final Cost at 0.6799999999999999: $ 579,650.00
Accuracy 0.96860625
Recall:  0.9457451282770223
Confusion Matrix: 
 [[94263  1540]
 [ 3483 60714]]
Final Cost at 0.6699999999999999: $ 579,300.00
Accuracy 0.9687625
Recall:  0.9470536006355437
Confusion Matrix: 
 [[94204  1599]
 [ 3399 60798]]
Final Cost at 0.6599999999999999: $ 579,750.00
Accuracy 0.968975
Recall:  0.9483153418384037
Confusion Matrix: 
 [[94157  1646]
 [ 3318 60879]]
Final Cost at 0.6499999999999999: $ 578,700.00
Accuracy 0.96925625
Recall:  0.9498263158714582
Confusion Matrix: 
 [[94105  1698]
 [ 3221 60976]]
Final Cost at 0.6399999999999999: $ 576,800.00
Accuracy 0.

In [96]:
import plotly.express as px
import numpy as np

def plot_confusion_matrix(values, title="Confusion Matrix"):
    TP, FP, FN, TN = values

    matrix = np.array([
        [TP, FN],
        [FP, TN]
    ])
    x_labels = ["Predicted Positive", "Predicted Negative"]
    y_labels = ["Actual Positive", "Actual Negative"]

    fig = px.imshow(matrix, labels=dict(x="Predicted Values", y="Actual Values", color="Count"),
                    x=x_labels, y=y_labels, color_continuous_scale="blues")

    fig.update_layout(title_text=title, title_x=0.5)
    for i, row in enumerate(matrix):
        for j, value in enumerate(row):
            fig.add_annotation(dict(
                x=j, y=i,
                text=str(value),
                showarrow=False,
                font_size=16,
                opacity=0.7,
                font_color='black'
            ))
    fig.show()

values = [94062, 1787, 3076, 61121]  # Example values (replace with your own)
plot_confusion_matrix(values, title="Neural Network Confusion Matrix")

In [98]:
print(best_hps.values)

{'num_layers': 5, 'units_0': 672, 'activation_0': 'tanh', 'weight_decay_0': 0.006957259092226169, 'units_1': 128, 'activation_1': 'relu', 'weight_decay_1': 0.005826016708182784, 'dropout': True, 'learning_rate': 0.0001, 'units_2': 384, 'activation_2': 'relu', 'weight_decay_2': 0.0004575703546504062, 'units_3': 32, 'activation_3': 'relu', 'weight_decay_3': 0.004430085324964402, 'units_4': 128, 'activation_4': 'relu', 'weight_decay_4': 0.0002765893495990468, 'units_5': 704, 'activation_5': 'tanh', 'weight_decay_5': 0.005206724153130433, 'dropout_5': 0.045, 'dropout_3': 0.055, 'dropout_1': 0.045, 'dropout_4': 0.015, 'dropout_2': 0.015, 'tuner/epochs': 50, 'tuner/initial_epoch': 17, 'tuner/bracket': 3, 'tuner/round': 3, 'tuner/trial_id': '0316'}
