# Initial models and evaluation

In [49]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, Dense, BatchNormalization
from tensorflow.keras.callbacks import ModelCheckpoint

from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import GradientBoostingClassifier

from sklearn.svm import LinearSVC

In [2]:
data = pd.read_csv("../data/processed/standardized_creditcard.csv")

In [16]:
train, test, val = data[:240000], data[240000:262000], data[262000:]

In [17]:
x_train, y_train = train.to_numpy()[:, :-1], train.to_numpy()[:, -1]
x_test, y_test = test.to_numpy()[:, :-1], test.to_numpy()[:, -1]
x_val, y_val = val.to_numpy()[:, :-1], val.to_numpy()[:, -1]

In [19]:
val.shape

(22807, 32)

## Logistic Regression

In [20]:
logistic_model = LogisticRegression()
logistic_model.fit(x_train, y_train)
logistic_model.score(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9988875

In [24]:
print(classification_report(y_val, logistic_model.predict(x_val), target_names=["Legitimate","Fraudulent"]))

              precision    recall  f1-score   support

  Legitimate       1.00      1.00      1.00     22789
  Fraudulent       0.80      0.22      0.35        18

    accuracy                           1.00     22807
   macro avg       0.90      0.61      0.67     22807
weighted avg       1.00      1.00      1.00     22807



In [36]:
shallow_nn = Sequential()
shallow_nn.add(InputLayer((x_train.shape[1],)))
shallow_nn.add(Dense(2, 'relu'))
shallow_nn.add(BatchNormalization())
shallow_nn.add(Dense(1, 'sigmoid'))

checkpoint = ModelCheckpoint('shallow_nn', save_best_only=True)
shallow_nn.compile(optimizer='adam', loss='binary_crossentropy', metrics='accuracy')

In [37]:
shallow_nn.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 2)                 64        
                                                                 
 batch_normalization_2 (Bat  (None, 2)                 8         
 chNormalization)                                                
                                                                 
 dense_5 (Dense)             (None, 1)                 3         
                                                                 
Total params: 75 (300.00 Byte)
Trainable params: 71 (284.00 Byte)
Non-trainable params: 4 (16.00 Byte)
_________________________________________________________________


In [38]:
shallow_nn.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=5, callbacks=checkpoint)

Epoch 1/5


INFO:tensorflow:Assets written to: shallow_nn\assets


Epoch 2/5


INFO:tensorflow:Assets written to: shallow_nn\assets


Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x28d7c78d310>

In [39]:
def neural_net_predictions(model, x):
    return (model.predict(x).flatten() > 0.5).astype(int)
neural_net_predictions(shallow_nn, x_val)




array([0, 0, 0, ..., 0, 0, 0])

In [40]:
print(classification_report(y_val, neural_net_predictions(shallow_nn, x_val), target_names=["Legitimate","Fraudulent"]))

              precision    recall  f1-score   support

  Legitimate       1.00      1.00      1.00     21963
  Fraudulent       0.00      0.00      0.00        37

    accuracy                           1.00     22000
   macro avg       0.50      0.50      0.50     22000
weighted avg       1.00      1.00      1.00     22000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Random Forest Classifier

In [44]:
rf = RandomForestClassifier(max_depth=2, n_jobs=-1)
rf.fit(x_train, y_train)

In [46]:

print(classification_report(y_val, rf.predict(x_val), target_names=["Legitimate","Fraudulent"]))

              precision    recall  f1-score   support

  Legitimate       1.00      1.00      1.00     22789
  Fraudulent       0.89      0.44      0.59        18

    accuracy                           1.00     22807
   macro avg       0.94      0.72      0.80     22807
weighted avg       1.00      1.00      1.00     22807



## Gradient Boosting Classifier

In [51]:
gbc = GradientBoostingClassifier(n_estimators=50, learning_rate=10, max_depth=1, random_state=0)
gbc.fit(x_train, y_train)
print(classification_report(y_test, gbc.predict(x_test), target_names=["Legitimate","Fraudulent"]))

              precision    recall  f1-score   support

  Legitimate       0.00      0.00      0.00     21963
  Fraudulent       0.00      1.00      0.00        37

    accuracy                           0.00     22000
   macro avg       0.00      0.50      0.00     22000
weighted avg       0.00      0.00      0.00     22000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Linear Support Vector Machin (SVC)

In [52]:
svc = LinearSVC(class_weight='balanced')
svc.fit(x_train, y_train)

print(classification_report(y_test, svc.predict(x_test), target_names=["Legitimate","Fraudulent"]))



              precision    recall  f1-score   support

  Legitimate       1.00      1.00      1.00     21963
  Fraudulent       0.00      0.00      0.00        37

    accuracy                           1.00     22000
   macro avg       0.50      0.50      0.50     22000
weighted avg       1.00      1.00      1.00     22000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [53]:
legits = data.query('Class == 0')
frauds = data.query('Class == 1')

In [55]:
legits['Class'].value_counts(), frauds['Class'].value_counts()

(0    284315
 Name: Class, dtype: int64,
 1    492
 Name: Class, dtype: int64)

In [63]:
balanced_df = pd.concat([frauds, legits.sample(len(frauds), random_state=1)])

In [64]:
balanced_df.shape

(984, 32)

In [58]:
balanced_df = balanced_df.sample(frac=1, random_state=1)

In [59]:
balanced_df

Unnamed: 0.1,Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
189959,189959,0.744404,-0.865285,-0.979506,2.587540,-2.781144,-0.887336,-0.579689,-0.976755,0.132058,...,-0.106978,-0.010528,-0.211955,0.021026,0.358237,-0.209483,0.062051,0.074730,-0.195626,0
107637,107637,0.408213,-2.271755,-0.457655,-2.589055,2.230778,-4.278983,0.388610,0.102485,0.813128,...,1.096342,0.658399,1.711676,0.333540,0.538591,-0.193529,0.258194,0.247269,11.218193,1
275992,275992,0.965502,-2.027135,-1.131890,-1.135194,1.086963,-0.010547,0.423797,3.790880,-1.155595,...,-0.315105,0.575520,0.490842,0.756502,-0.142685,-0.602777,0.508712,-0.091646,8.555858,1
120862,120862,0.439760,0.531678,-1.108844,0.276972,0.386453,-1.038906,-0.810526,0.395582,-0.322635,...,0.000589,-0.824566,-0.174821,0.479535,-0.094335,0.698329,-0.130716,0.083227,5.094669,0
207960,207960,0.792328,1.878626,0.162765,-0.167433,3.465196,0.197332,1.157212,-0.676783,0.473890,...,-0.217428,-0.785738,0.406279,-0.056071,-0.560484,-0.388620,-0.012717,-0.038421,-0.223713,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
236229,236229,0.860700,-1.319844,0.290232,-0.223288,-0.351133,2.003048,0.004449,2.111141,-0.155835,...,0.259482,0.301030,-0.388021,-1.449786,1.720770,-0.282374,-0.106111,0.026727,2.379375,0
15810,15810,0.157716,-25.942434,14.601998,-27.368650,6.378395,-19.104033,-4.684806,-18.261393,17.052566,...,1.784316,-1.917759,-1.235787,0.161105,1.820378,-0.219359,1.388786,0.406810,1.089779,1
1569,1569,0.007107,-0.693097,0.720897,0.487926,1.545283,-0.123343,0.151906,1.821822,-0.176592,...,0.200782,0.193611,0.288196,-0.081502,0.281742,-0.136080,0.050083,0.147487,3.604136,0
107067,107067,0.406674,-1.512516,1.133139,-1.601052,2.813401,-2.664503,-0.310371,-1.520895,0.852996,...,0.729828,0.485286,0.567005,0.323586,0.040871,0.825814,0.414482,0.267265,4.137637,1


In [60]:
balanced_df_np = balanced_df.to_numpy()

In [65]:
balanced_df_np.shape

(984, 32)

In [61]:
x_train_b, y_train_b = balanced_df_np[:700, :-1], balanced_df_np[:700, -1].astype(int)
x_test_b, y_test_b = balanced_df_np[700:842, :-1], balanced_df_np[700:842, -1].astype(int)
x_val_b, y_val_b = balanced_df_np[842:, :-1], balanced_df_np[842:, -1].astype(int)

In [66]:
x_train_b.shape, x_test_b.shape, x_val_b.shape

((700, 31), (142, 31), (142, 31))

## Logistic Regression

In [None]:
logistic_model_b = LogisticRegression()
logistic_model_b.fit(x_train_b, y_train_b)
logistic_model_b.score(x_train_b, y_train_b)

0.95

In [71]:
print(classification_report(y_val_b, logistic_model_b.predict(x_val_b), target_names=["Legitimate","Fraudulent"]))

              precision    recall  f1-score   support

  Legitimate       0.93      0.97      0.95        72
  Fraudulent       0.97      0.93      0.95        70

    accuracy                           0.95       142
   macro avg       0.95      0.95      0.95       142
weighted avg       0.95      0.95      0.95       142



## Shallow Nureal Network

not sure if inputlayer should take balanced data

In [84]:
shallow_nn_b = Sequential()
shallow_nn_b.add(InputLayer((x_train_b.shape[1],)))
shallow_nn_b.add(Dense(2, 'relu'))
shallow_nn_b.add(BatchNormalization())
shallow_nn_b.add(Dense(1, 'sigmoid'))

checkpoint = ModelCheckpoint('shallow_nn_b', save_best_only=True)
shallow_nn_b.compile(optimizer='adam', loss='binary_crossentropy', metrics='accuracy')

In [85]:
shallow_nn.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 2)                 64        
                                                                 
 batch_normalization_2 (Bat  (None, 2)                 8         
 chNormalization)                                                
                                                                 
 dense_5 (Dense)             (None, 1)                 3         
                                                                 
Total params: 75 (300.00 Byte)
Trainable params: 71 (284.00 Byte)
Non-trainable params: 4 (16.00 Byte)
_________________________________________________________________


In [86]:
shallow_nn_b.fit(x_train_b, y_train_b, validation_data=(x_val_b, y_val_b), epochs=40, callbacks=checkpoint)

Epoch 1/40
 1/22 [>.............................] - ETA: 18s - loss: 0.7863 - accuracy: 0.3750INFO:tensorflow:Assets written to: shallow_nn_b\assets


INFO:tensorflow:Assets written to: shallow_nn_b\assets


Epoch 2/40
 1/22 [>.............................] - ETA: 0s - loss: 0.7477 - accuracy: 0.4688INFO:tensorflow:Assets written to: shallow_nn_b\assets


INFO:tensorflow:Assets written to: shallow_nn_b\assets


Epoch 3/40
 1/22 [>.............................] - ETA: 0s - loss: 0.8145 - accuracy: 0.3750INFO:tensorflow:Assets written to: shallow_nn_b\assets


INFO:tensorflow:Assets written to: shallow_nn_b\assets


Epoch 4/40
 1/22 [>.............................] - ETA: 0s - loss: 0.7332 - accuracy: 0.4688INFO:tensorflow:Assets written to: shallow_nn_b\assets


INFO:tensorflow:Assets written to: shallow_nn_b\assets


Epoch 5/40
 1/22 [>.............................] - ETA: 0s - loss: 0.8063 - accuracy: 0.3125INFO:tensorflow:Assets written to: shallow_nn_b\assets


INFO:tensorflow:Assets written to: shallow_nn_b\assets


Epoch 6/40
 1/22 [>.............................] - ETA: 0s - loss: 0.7520 - accuracy: 0.3125INFO:tensorflow:Assets written to: shallow_nn_b\assets


INFO:tensorflow:Assets written to: shallow_nn_b\assets


Epoch 7/40
 1/22 [>.............................] - ETA: 0s - loss: 0.7895 - accuracy: 0.3125INFO:tensorflow:Assets written to: shallow_nn_b\assets


INFO:tensorflow:Assets written to: shallow_nn_b\assets


Epoch 8/40
 1/22 [>.............................] - ETA: 0s - loss: 0.7536 - accuracy: 0.3750INFO:tensorflow:Assets written to: shallow_nn_b\assets


INFO:tensorflow:Assets written to: shallow_nn_b\assets


Epoch 9/40


INFO:tensorflow:Assets written to: shallow_nn_b\assets


Epoch 10/40
 1/22 [>.............................] - ETA: 0s - loss: 0.7214 - accuracy: 0.4062INFO:tensorflow:Assets written to: shallow_nn_b\assets


INFO:tensorflow:Assets written to: shallow_nn_b\assets


Epoch 11/40
 1/22 [>.............................] - ETA: 0s - loss: 0.7600 - accuracy: 0.2812INFO:tensorflow:Assets written to: shallow_nn_b\assets


INFO:tensorflow:Assets written to: shallow_nn_b\assets


Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
 1/22 [>.............................] - ETA: 0s - loss: 0.6938 - accuracy: 0.4062INFO:tensorflow:Assets written to: shallow_nn_b\assets


INFO:tensorflow:Assets written to: shallow_nn_b\assets




<keras.src.callbacks.History at 0x28d2b5826d0>

In [87]:
print(classification_report(y_val_b, neural_net_predictions(shallow_nn_b, x_val_b), target_names=["Legitimate","Fraudulent"]))

              precision    recall  f1-score   support

  Legitimate       0.00      0.00      0.00        72
  Fraudulent       0.49      1.00      0.66        70

    accuracy                           0.49       142
   macro avg       0.25      0.50      0.33       142
weighted avg       0.24      0.49      0.33       142



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Random Forest Classifier

In [88]:
rf_b = RandomForestClassifier(max_depth=2, n_jobs=-1)
rf_b.fit(x_train_b, y_train_b)

In [89]:

print(classification_report(y_val_b, rf_b.predict(x_val_b), target_names=["Legitimate","Fraudulent"]))

              precision    recall  f1-score   support

  Legitimate       0.87      1.00      0.93        72
  Fraudulent       1.00      0.84      0.91        70

    accuracy                           0.92       142
   macro avg       0.93      0.92      0.92       142
weighted avg       0.93      0.92      0.92       142



## Gradient Boosting Classifier

In [95]:
gbc_b = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=2, random_state=0)
gbc_b.fit(x_train_b, y_train_b)
print(classification_report(y_val_b, gbc.predict(x_val_b), target_names=["Legitimate","Fraudulent"]))

              precision    recall  f1-score   support

  Legitimate       0.00      0.00      0.00        72
  Fraudulent       0.49      1.00      0.66        70

    accuracy                           0.49       142
   macro avg       0.25      0.50      0.33       142
weighted avg       0.24      0.49      0.33       142



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Linear Support Vector Machin (SVC)

In [96]:
svc_b = LinearSVC(class_weight='balanced')
svc_b.fit(x_train_b, y_train_b)

print(classification_report(y_val_b, svc_b.predict(x_val_b), target_names=["Legitimate","Fraudulent"]))

              precision    recall  f1-score   support

  Legitimate       1.00      0.06      0.11        72
  Fraudulent       0.51      1.00      0.67        70

    accuracy                           0.52       142
   macro avg       0.75      0.53      0.39       142
weighted avg       0.76      0.52      0.39       142



