# TODO:
* Tune keras just on vanilla dataset and see what's the best score we get
* Add original data to the mix with training on both datasets but validation only on competition dataset and see how good it performs
* Do feature engineering based on time feature and see how good that performs

* **Drop the time feature, but used the newly engineered columns. Don't one hot encode the hour feature, rather use LeaveOneOutEncoding**

**Also, Anytime i refer to a book, I'm talking about this one**
https://fraud-detection-handbook.github.io/fraud-detection-handbook/Chapter_4_PerformanceMetrics/Introduction.html

# Important Note:
Now that I think about it, if we want to take the time features into account, we'll have to split the dataset for training and validation such that the validation comes after the training.
So i don't think we need Cross Validation for this one... Let's see

# Imports

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
from pathlib import Path
import xgboost as xgb
import lightgbm as lgbm
import catboost
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import roc_auc_score
from IPython.display import display
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
import optuna
from sklearn.preprocessing import StandardScaler
from scipy.linalg import norm

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import keras_tuner as kt

from category_encoders import LeaveOneOutEncoder

from IPython.display import display
from tqdm.notebook import tqdm

# Loading Data

In [2]:
BASE_PATH = Path("/kaggle/input/playground-series-s3e4/")

train = pd.read_csv(BASE_PATH / "train.csv")
test = pd.read_csv(BASE_PATH / "test.csv")

original = pd.read_csv("/kaggle/input/creditcardfraud/creditcard.csv")

print(f"Training dataset has {len(train)} rows with " \
                    f"{sum(train.Class) / len(train) * 100 :.2}% fraud rows.")
# print(f"Original dataset has {len(original)} rows with " \
#                     f"{sum(original.Class) / len(original) * 100 :.2}% fraud rows.")

Training dataset has 219129 rows with 0.21% fraud rows.


# Feature Engineering

In [3]:
train["Hour"] = ((train.Time // 3600) % 24).astype("int")
test["Hour"] = ((test.Time // 3600) % 24).astype("int")
# original["Hour"] = ((original.Time // 3600) % 24).astype("category")

In [4]:
def time_of_day(hour):
    if hour >=0 and hour <= 6:
        return "night"
    elif hour <= 12:
        return "morning"
    elif hour <= 18:
        return "afternoon"
    else:
        return "evening"

In [5]:
train.Hour.map(time_of_day).head()

0    night
1    night
2    night
3    night
4    night
Name: Hour, dtype: object

In [6]:
train["TimeOfDay"] = train.Hour.map(time_of_day)
test["TimeOfDay"] = test.Hour.map(time_of_day)

In [7]:
train["Day"] = ((train['Time'] // (24 * 3600)) % 7).astype("category")
test["Day"] = ((test["Time"] // (24 * 3600)) % 7).astype("category")

In [8]:
train.head()

Unnamed: 0,id,Time,V1,V2,V3,V4,V5,V6,V7,V8,...,V24,V25,V26,V27,V28,Amount,Class,Hour,TimeOfDay,Day
0,0,0.0,2.074329,-0.129425,-1.137418,0.412846,-0.192638,-1.210144,0.110697,-0.263477,...,-0.110835,-0.291459,0.207733,-0.076576,-0.059577,1.98,0,0,night,0.0
1,1,0.0,1.998827,-1.250891,-0.520969,-0.894539,-1.122528,-0.270866,-1.029289,0.050198,...,-0.461928,-0.465491,-0.464655,-0.009413,-0.038238,84.0,0,0,night,0.0
2,2,0.0,0.091535,1.004517,-0.223445,-0.435249,0.667548,-0.988351,0.948146,-0.084789,...,0.951233,-0.506919,0.085046,0.224458,0.087356,2.69,0,0,night,0.0
3,3,0.0,1.979649,-0.184949,-1.064206,0.120125,-0.215238,-0.648829,-0.087826,-0.035367,...,-0.042939,0.000799,-0.096148,-0.05778,-0.073839,1.0,0,0,night,0.0
4,4,0.0,1.025898,-0.171827,1.203717,1.2439,-0.636572,1.099074,-0.938651,0.569239,...,-0.262813,0.257834,-0.252829,0.108338,0.021051,1.0,0,0,night,0.0


In [9]:
# train["is_night"] = train.Hour.map(lambda x: int(x <= 6))
# test["is_night"] = test.Hour.map(lambda x: int(x <= 6))
# original["is_night"] = test.Hour.map(lambda x: int(x <= 6))

# Dropping Unnecessary Columns and making training and test sets

In [10]:
X = train.drop(columns=["id", "Class"])
y = train.Class

In [11]:
X_test = test.drop(columns=["id"])

In [12]:
X.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Hour', 'TimeOfDay', 'Day'],
      dtype='object')

In [13]:
numerical_feats = list(set(X.columns) - set(["TimeOfDay", "Day", "Hour"]))

In [14]:
# encoding
feats_to_encode = ["TimeOfDay", "Day", "Hour"]
loo = LeaveOneOutEncoder()
loo.fit(X[feats_to_encode], y)
X[feats_to_encode] = loo.transform(X[feats_to_encode])
X_test[feats_to_encode] = loo.transform(X_test[feats_to_encode])

# scaling
feats_to_scale = numerical_feats
sc = StandardScaler()
sc.fit(X[feats_to_scale])
X[feats_to_scale] = sc.transform(X[feats_to_scale])
X_test[feats_to_scale] = sc.transform(X_test[feats_to_scale])

In [15]:
X.shape

(219129, 33)

### PREVIOUSLY we tried to do this:
Since this dataset contains a time axis, so we'll make sure we're validating on the future we'll use 20% of the data for validation

### BUT WHILE TRYING TO SLEEP LAST NIGHT, I REALIZED:
The dataset is not strictly time series dataset, WHY? Well the chances of the next transaction being independent of the previous one are far far greater than it being dependent. WHY? We'll we don't have terminal (ATMs, & other payment equipment) IDs or Customer IDs. And only if we had those, we could've grouped them together and then those trasactions would've been dependent on the previous one!

# TODOs:
1. Try random shuffle splitting for train & val
2. Try splitting dataset based on time, like train on past and predict on future

In [16]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=1337, stratify=y)

In [17]:
X_train.shape

(175303, 33)

# Cross Validate

# Building Keras Model

In [128]:
tf.random.set_seed(1337)


def residual_block(x, units, depth=2, dropout_rate=None):
    residual = x
    for _ in range(depth):
        x = layers.Dense(units)(x)
        x = layers.BatchNormalization()(x)
        x = layers.Activation("relu")(x)
        if dropout_rate:
            x = layers.Dropout(dropout_rate)(x)
        
    return layers.add([x, residual])

def build_model():
    
    inputs = layers.Input(shape=(33,))

    x = layers.Dense(512)(inputs)
    x = layers.BatchNormalization()(x)
    x = layers.Activation("relu")(x)
    x = layers.Dropout(0.5)(x)
    x = residual_block(x, units=512, depth=3, dropout_rate=0.5)

    x = layers.Dense(256)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation("relu")(x)
    x = layers.Dropout(0.5)(x)
    x = residual_block(x, units=256, depth=3, dropout_rate=0.5)

    x = layers.Dense(128)(inputs)
    x = layers.BatchNormalization()(x)
    x = layers.Activation("relu")(x)
    x = layers.Dropout(0.5)(x)
    x = residual_block(x, units=128, depth=3, dropout_rate=0.5)
    
#     x = layers.Dense(64)(inputs)
#     x = layers.BatchNormalization()(x)
#     x = layers.Activation("relu")(x)
#     x = layers.Dropout(0.3)(x)
# #     x = residual_block(x, units=64, depth=2)

    x = layers.Dense(128)(x)
    x = layers.BatchNormalization()(x)
    x = layers.Activation("relu")(x)
    x = layers.Dropout(0.3)(x)
    # x = residual_block(x, units=32, dropout_rate=0.1)

    outputs = layers.Dense(1, activation="sigmoid")(x)

    keras_model = keras.Model(inputs=inputs, outputs=outputs)

    optim = keras.optimizers.Adam(learning_rate=0.01)

    keras_model.compile(optimizer=optim,
                       loss=keras.losses.binary_crossentropy,
                       metrics=[keras.metrics.AUC(name="auc")])
    
    return keras_model

In [129]:
## CALL BACKS :O
# early_stopping = keras.callbacks.EarlyStopping(
#                 patience=20,
#                 min_delta=0.001,
#                 monitor="val_auc",
#                 restore_best_weights=True,
#                 )

reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5,
                              patience=5, min_lr=0.001)

In [131]:
keras_model = build_model()
keras_model.fit(X_train, y_train, validation_data=(X_val, y_val), 
                batch_size=8192, epochs=50, callbacks=[reduce_lr])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7f074365c9d0>

# Cross Validating Keras

In [136]:
def cross_validate(X, y):
    N_FOLDS = 8
    
    skf = StratifiedKFold(n_splits=N_FOLDS, random_state=1337, shuffle=True)
    cv_scores = np.zeros(N_FOLDS)
    
    for fold_id, (train_idx, val_idx) in tqdm(enumerate(skf.split(X, y)), total=N_FOLDS):
        model = build_model()
        
        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
        
        print("TRAIN SET: ", X_tr.TimeOfDay.value_counts())
        print("VALIDATION SET: ", X_val.TimeOfDay.value_counts())
        
        model.fit(X_tr, y_tr, validation_data=(X_val, y_val), 
                        batch_size=8192, epochs=10, callbacks=[reduce_lr], verbose=0)        
        y_pred = model.predict(X_val)[:, 0]
        
        auc = roc_auc_score(y_val, y_pred)
        
        print(f"Fold: {fold_id} \t | \t auc: {auc}")
        
        cv_scores[fold_id] = auc
    
    avg_auc = np.mean(cv_scores)
    print(f"AVG AUC: {avg_auc}")

In [137]:
# we recompile and then call the keras model
cross_validate(X, y)

  0%|          | 0/8 [00:00<?, ?it/s]

TRAIN SET:  0.001637    67409
0.002286    56713
0.001580    40316
0.003905    27299
Name: TimeOfDay, dtype: int64
VALIDATION SET:  0.001637    9551
0.002286    8018
0.001580    5879
0.003905    3944
Name: TimeOfDay, dtype: int64
Fold: 0 	 | 	 auc: 0.7521398049294112
TRAIN SET:  0.001637    67457
0.002286    56497
0.001580    40527
0.003905    27257
Name: TimeOfDay, dtype: int64
VALIDATION SET:  0.001637    9503
0.002286    8234
0.001580    5668
0.003905    3986
Name: TimeOfDay, dtype: int64
Fold: 1 	 | 	 auc: 0.8005366760149726
TRAIN SET:  0.001637    67292
0.002286    56690
0.001580    40396
0.003905    27360
Name: TimeOfDay, dtype: int64
VALIDATION SET:  0.001637    9668
0.002286    8041
0.001580    5799
0.003905    3883
Name: TimeOfDay, dtype: int64
Fold: 2 	 | 	 auc: 0.7819529758773341
TRAIN SET:  0.001637    67365
0.002286    56721
0.001580    40374
0.003905    27278
Name: TimeOfDay, dtype: int64
VALIDATION SET:  0.001637    9595
0.002286    8010
0.001580    5821
0.003905    3965


# FineTuning Keras

In [20]:
# def model_builder(hp):
#     inputs = layers.Input(shape=(30,))
    
#     hp_units_1 = hp.Int("units_1", min_value=128, max_value=1024, step=128)
#     x = layers.Dense(hp_units_1, activation="relu")(inputs)
#     x = layers.BatchNormalization()(x)
#     hp_dropout_1 = hp.Float("dropout_1", min_value=0, max_value=0.8, step=0.1)
#     x = layers.Dropout(hp_dropout_1)(x)
    
#     hp_units_2 = hp.Int("units_2", min_value=64, max_value=512, step=64)
#     x = layers.Dense(hp_units_2, activation="relu")(x)
#     x = layers.BatchNormalization()(x)
#     hp_dropout_2 = hp.Float("dropout_2", min_value=0, max_value=0.8, step=0.1)
#     x = layers.Dropout(hp_dropout_2)(x)
    
#     hp_units_3 = hp.Int("units_3", min_value=32, max_value=256, step=32)
#     x = layers.Dense(hp_units_3, activation="relu")(x)
#     x = layers.BatchNormalization()(x)
#     hp_dropout_3 = hp.Float("dropout_3", min_value=0, max_value=0.8, step=0.1)
#     x = layers.Dropout(hp_dropout_3)(x)
    
#     hp_units_4 = hp.Int("units_4", min_value=32, max_value=256, step=32)
#     x = layers.Dense(hp_units_4, activation="relu")(x)
#     x = layers.BatchNormalization()(x)
#     hp_dropout_4 = hp.Float("dropout_4", min_value=0, max_value=0.8, step=0.1)
#     x = layers.Dropout(hp_dropout_4)(x)
    
#     hp_units_5 = hp.Int("units_5", min_value=16, max_value=128, step=16)
#     x = layers.Dense(hp_units_5, activation="relu")(x)
#     x = layers.BatchNormalization()(x)
#     hp_dropout_5 = hp.Float("dropout_5", min_value=0, max_value=0.8, step=0.1)
#     x = layers.Dropout(hp_dropout_5)(x)

#     outputs = layers.Dense(1, activation="sigmoid")(x)

#     keras_model = keras.Model(inputs=inputs, outputs=outputs)
    
#     hp_learning_rate = hp.Float("learning_rate", min_value=1e-05, max_value=1e-1, sampling="log")
#     hp_optimizer = hp.Choice("optimizer", ["rmsprop", "adam"])
    
#     if hp_optimizer == "adam":
#         optim = keras.optimizers.Adam(learning_rate=hp_learning_rate)
#     else:
#         optim = keras.optimizers.RMSprop(learning_rate=hp_learning_rate)
        
#     keras_model.compile(optimizer=optim,
#                        loss=keras.losses.binary_crossentropy,
#                        metrics=[keras.metrics.AUC()])
    
#     return keras_model

In [21]:
early_stopping = keras.callbacks.EarlyStopping(
                patience=5,
                min_delta=0.001,
                monitor="val_auc",
                restore_best_weights=True,
                )

reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.1,
                              patience=5, min_lr=0.001)

In [22]:
tuner = kt.Hyperband(model_builder,
                    objective=kt.Objective("val_auc", direction="max"),
                    max_epochs=50,
                    directory="./",
                    project_name="tuning_keras",
                    overwrite=True,
                    )

In [23]:
tuner.search(X_train, y_train, epochs=50, validation_data=(X_val, y_val), batch_size=1024, callbacks=[early_stopping, reduce_lr])

Trial 90 Complete [00h 00m 24s]
val_auc: 0.4510006308555603

Best val_auc So Far: 0.7841417789459229
Total elapsed time: 00h 17m 33s


In [None]:
# best_hps_list = tuner.get_best_hyperparameters(num_trials=5)

In [None]:
# for i, best_hps in enumerate(best_hps_list):
#     print(f"{'-'*15} {i} {'-'*15}")
#     print(best_hps.values)

# FineTuned Keras

In [None]:
inputs = layers.Input(shape=(55,))

x = layers.Dense(1024, activation="relu")(inputs)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.8)(x)

x = layers.Dense(512, activation="relu")(x)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.1)(x)

x = layers.Dense(448, activation="relu")(x)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.6)(x)

x = layers.Dense(160, activation="relu")(x)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.7)(x)

x = layers.Dense(128, activation="relu")(x)
x = layers.BatchNormalization()(x)
x = layers.Dropout(0.3)(x)

outputs = layers.Dense(1, activation="sigmoid")(x)

keras_model = keras.Model(inputs=inputs, outputs=outputs)

optim = keras.optimizers.Adam(learning_rate=0.053)

keras_model.compile(optimizer=optim,
                   loss=keras.losses.binary_crossentropy,
                   metrics=[keras.metrics.AUC()])

In [None]:
keras_model.fit(X_train, y_train, validation_data=(X_val, y_val), 
                batch_size=1024, callbacks=[early_stopping, reduce_lr],
               epochs=50)

In [None]:
X_test = pd.get_dummies(X_test)

In [None]:
keras_model.predict(X_test)