### Lil bit of data prep

Although we can do these transformations as part of each modelling pipeline, for simplicity we perform them in bulk now. 

In [1]:
import pandas as pd

In [2]:
# Get the raw data
raw_data = pd.read_csv('creditcard.csv')

In [3]:
raw_data.head(3)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0


We know from the dataset [kaggle page](https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud/):

"It contains only numerical input variables which are the result of a PCA transformation. Unfortunately, due to confidentiality issues, we cannot provide the original features and more background information about the data. Features V1, V2, … V28 are the principal components obtained with PCA, the only features which have not been transformed with PCA are 'Time' and 'Amount'. Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. The feature 'Amount' is the transaction Amount, this feature can be used for example-dependant cost-sensitive learning. Feature 'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise."

In [4]:
# Checking distributions
raw_data.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,1.168375e-15,3.416908e-16,-1.379537e-15,2.074095e-15,9.604066e-16,1.487313e-15,-5.556467e-16,1.213481e-16,-2.406331e-15,...,1.654067e-16,-3.568593e-16,2.578648e-16,4.473266e-15,5.340915e-16,1.683437e-15,-3.660091e-16,-1.22739e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


In [5]:
raw_data.isna().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

This dataset looks pretty good. For simplicity (we don't really care about _absolute_ model formance here) we can drop the Amount and Time columns to be left with only the PCA/transformed/anaonymised VX columns.

In [6]:
model_data = raw_data.drop(['Amount', 'Time'], axis=1)

In [7]:
model_data.head(3)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0


In [8]:
# Split to train and test
split_ratio = 0.7  # train/test split
index_train = round(len(model_data)*split_ratio)  # get index value to split data on
train_data = model_data[:index_train]  # define training data
test_data = model_data[index_train:]  # define test data

### XGBoost

In [9]:
import xgboost as xgb
from sklearn.metrics import roc_auc_score
import matplotlib

%matplotlib inline

In [10]:
# Set parameters
params = {
    "objective": "binary:logistic",  # binary classification problem
    "device": "cpu",  # no GPU sadly :(
    "verbosity": 2,  # increase info in output
    "eta": 0.1,  # lower learning rate to help prevent overfit
    "max_depth": 4,  # reduce to prevent overfit (and my machine from blowing up)
    "eval_metric": "auc"  # severely unbalanced dataset, use AUCROC
}

In [11]:
# Get validation set
split_ratio = 0.7  # train/test split
index_train_val = round(len(train_data)*split_ratio)  # get index value to split data on
train_val_data = train_data[:index_train_val]  # define training data
val_data = train_data[index_train_val:]  # define test data

In [12]:
# Create training and validation matrices
xgb_train = xgb.DMatrix(
    data=train_val_data.drop(['Class'], axis=1),
    label=train_val_data['Class']
)
xgb_val = xgb.DMatrix(
    data=val_data.drop(['Class'], axis=1),
    label=val_data['Class']
) 

In [13]:
# Set watchlist for early stopping
evallist = [(xgb_train, 'train'), (xgb_val, 'eval')]

In [14]:
# Train a model
xgb_model = xgb.train(
    params=params,
    dtrain=xgb_train,
    num_boost_round=100,
    evals=evallist,
    early_stopping_rounds=10
)

[0]	train-auc:0.92960	eval-auc:0.92872
[1]	train-auc:0.92960	eval-auc:0.92872
[2]	train-auc:0.92961	eval-auc:0.92872
[3]	train-auc:0.92962	eval-auc:0.92872
[4]	train-auc:0.92962	eval-auc:0.92872
[5]	train-auc:0.92962	eval-auc:0.92872
[6]	train-auc:0.92962	eval-auc:0.92872
[7]	train-auc:0.92962	eval-auc:0.92871
[8]	train-auc:0.92962	eval-auc:0.92871
[9]	train-auc:0.92962	eval-auc:0.92873
[10]	train-auc:0.92962	eval-auc:0.92873
[11]	train-auc:0.92963	eval-auc:0.92873
[12]	train-auc:0.92963	eval-auc:0.92873
[13]	train-auc:0.92963	eval-auc:0.92873
[14]	train-auc:0.92963	eval-auc:0.92872
[15]	train-auc:0.92963	eval-auc:0.92873
[16]	train-auc:0.92963	eval-auc:0.92873
[17]	train-auc:0.96124	eval-auc:0.95152
[18]	train-auc:0.96200	eval-auc:0.95426
[19]	train-auc:0.96203	eval-auc:0.95425
[20]	train-auc:0.96205	eval-auc:0.95427
[21]	train-auc:0.96204	eval-auc:0.95438
[22]	train-auc:0.96205	eval-auc:0.95426
[23]	train-auc:0.96208	eval-auc:0.95431
[24]	train-auc:0.96208	eval-auc:0.95431
[25]	train

In [15]:
# Make predictions using test data
preds = xgb_model.predict(xgb.DMatrix(test_data.drop(['Class'], axis=1)))

In [16]:
# Get binary predictions
bin_preds = (preds > 0.5).astype(int)

In [17]:
# Evaluate predictions
roc_auc_score(test_data['Class'], bin_preds)

0.8841420726747795

### TensorFlow/Keras

In [18]:
import numpy as np

import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers




In [19]:
# A utility method to create a tf.data dataset from a Pandas DF
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  dataframe = dataframe.copy()
  labels = dataframe.pop('Class')
  ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  return ds

In [20]:
# Add all numerical columns to feature_columns
feature_columns = []
# numeric cols
for header in list(train_val_data.drop(['Class'], axis=1).columns):
  feature_columns.append(feature_column.numeric_column(header))

Instructions for updating:
Use Keras preprocessing layers instead, either directly or via the `tf.keras.utils.FeatureSpace` utility. Each of `tf.feature_column.*` has a functional equivalent in `tf.keras.layers` for feature preprocessing when training a Keras model.


In [21]:
# Create train, val, test datasets
train_ds = df_to_dataset(train_val_data)
val_ds = df_to_dataset(val_data, shuffle=False)
test_ds = df_to_dataset(test_data, shuffle=False)

In [22]:
feature_columns

[NumericColumn(key='V1', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='V2', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='V3', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='V4', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='V5', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='V6', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='V7', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='V8', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='V9', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='V10', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None),
 NumericColumn(key='V11', shape=(1,), d

In [23]:
# Create dense layer from these input features
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)




In [24]:
# Create sequential model
model = tf.keras.Sequential(
    [
        feature_layer,  # defines the input layer and its dimensions
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.1),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.1),
        layers.Dense(32, activation='relu'),
        layers.Dropout(0.1),
        layers.Dense(1, activation='sigmoid')  # sigmoid activation for binary classification
    ]
)
# Compile model 
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.AUC()]
)
# Define early stopping
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc',
    #min_delta=0.01,
    patience=10,
    verbose=1
)
# Fit to training/validation ds
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=100,
    callbacks=[early_stopping]
)


Epoch 1/100

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 13: early stopping


<keras.src.callbacks.History at 0x1b1ff9ab290>

In [26]:
loss, auc = model.evaluate(test_ds)
print(auc)

0.9116330146789551


### PyTorch