# Initial models and evaluation

In [49]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, Dense, BatchNormalization
from tensorflow.keras.callbacks import ModelCheckpoint

from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import GradientBoostingClassifier

from sklearn.svm import LinearSVC

In [2]:
data = pd.read_csv("../data/processed/standardized_creditcard.csv")

In [16]:
train, test, val = data[:240000], data[240000:262000], data[262000:]

In [17]:
x_train, y_train = train.to_numpy()[:, :-1], train.to_numpy()[:, -1]
x_test, y_test = test.to_numpy()[:, :-1], test.to_numpy()[:, -1]
x_val, y_val = val.to_numpy()[:, :-1], val.to_numpy()[:, -1]

In [19]:
val.shape

(22807, 32)

In [20]:
logistic_model = LogisticRegression()
logistic_model.fit(x_train, y_train)
logistic_model.score(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9988875

In [24]:
print(classification_report(y_val, logistic_model.predict(x_val), target_names=["Legitimate","Fraudulent"]))

              precision    recall  f1-score   support

  Legitimate       1.00      1.00      1.00     22789
  Fraudulent       0.80      0.22      0.35        18

    accuracy                           1.00     22807
   macro avg       0.90      0.61      0.67     22807
weighted avg       1.00      1.00      1.00     22807



In [36]:
shallow_nn = Sequential()
shallow_nn.add(InputLayer((x_train.shape[1],)))
shallow_nn.add(Dense(2, 'relu'))
shallow_nn.add(BatchNormalization())
shallow_nn.add(Dense(1, 'sigmoid'))

checkpoint = ModelCheckpoint('shallow_nn', save_best_only=True)
shallow_nn.compile(optimizer='adam', loss='binary_crossentropy', metrics='accuracy')

In [37]:
shallow_nn.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 2)                 64        
                                                                 
 batch_normalization_2 (Bat  (None, 2)                 8         
 chNormalization)                                                
                                                                 
 dense_5 (Dense)             (None, 1)                 3         
                                                                 
Total params: 75 (300.00 Byte)
Trainable params: 71 (284.00 Byte)
Non-trainable params: 4 (16.00 Byte)
_________________________________________________________________


In [38]:
shallow_nn.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=5, callbacks=checkpoint)

Epoch 1/5


INFO:tensorflow:Assets written to: shallow_nn\assets


Epoch 2/5


INFO:tensorflow:Assets written to: shallow_nn\assets


Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x28d7c78d310>

In [39]:
def neural_net_predictions(model, x):
    return (model.predict(x).flatten() > 0.5).astype(int)
neural_net_predictions(shallow_nn, x_val)




array([0, 0, 0, ..., 0, 0, 0])

In [40]:
print(classification_report(y_val, neural_net_predictions(shallow_nn, x_val), target_names=["Legitimate","Fraudulent"]))

              precision    recall  f1-score   support

  Legitimate       1.00      1.00      1.00     21963
  Fraudulent       0.00      0.00      0.00        37

    accuracy                           1.00     22000
   macro avg       0.50      0.50      0.50     22000
weighted avg       1.00      1.00      1.00     22000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Random Forest Classifier

In [44]:
rf = RandomForestClassifier(max_depth=2, n_jobs=-1)
rf.fit(x_train, y_train)

In [46]:

print(classification_report(y_val, rf.predict(x_val), target_names=["Legitimate","Fraudulent"]))

              precision    recall  f1-score   support

  Legitimate       1.00      1.00      1.00     22789
  Fraudulent       0.89      0.44      0.59        18

    accuracy                           1.00     22807
   macro avg       0.94      0.72      0.80     22807
weighted avg       1.00      1.00      1.00     22807



## Gradient Boosting Classifier

In [51]:
gbc = GradientBoostingClassifier(n_estimators=50, learning_rate=10, max_depth=1, random_state=0)
gbc.fit(x_train, y_train)
print(classification_report(y_test, gbc.predict(x_test), target_names=["Legitimate","Fraudulent"]))

              precision    recall  f1-score   support

  Legitimate       0.00      0.00      0.00     21963
  Fraudulent       0.00      1.00      0.00        37

    accuracy                           0.00     22000
   macro avg       0.00      0.50      0.00     22000
weighted avg       0.00      0.00      0.00     22000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Linear Support Vector Machin (SVC)

In [52]:
svc = LinearSVC(class_weight='balanced')
svc.fit(x_train, y_train)

print(classification_report(y_test, svc.predict(x_test), target_names=["Legitimate","Fraudulent"]))



              precision    recall  f1-score   support

  Legitimate       1.00      1.00      1.00     21963
  Fraudulent       0.00      0.00      0.00        37

    accuracy                           1.00     22000
   macro avg       0.50      0.50      0.50     22000
weighted avg       1.00      1.00      1.00     22000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [53]:
legits = data.query('Class == 0')
frauds = data.query('Class == 1')

In [54]:
legits['Class'].value_counts(), frauds['Class'].value_counts()

(Unnamed: 0  Time      V1         V2         V3         V4         V5         V6         V7         V8         V9         V10        V11        V12        V13        V14        V15        V16        V17        V18        V19        V20        V21        V22        V23        V24        V25        V26        V27        V28        Amount     Class
 0           0.000000  -1.359807  -0.072781   2.536347   1.378155  -0.338321   0.462388   0.239599   0.098698   0.363787   0.090794  -0.551600  -0.617801  -0.991390  -0.311169   1.468177  -0.470401   0.207971   0.025791   0.403993   0.251412  -0.018307   0.277838  -0.110474   0.066928   0.128539  -0.189115   0.133558  -0.021053   1.783274  0        1
 189912      0.744300  -1.237538   3.379870  -1.737462   4.530006   0.517640   0.027493   0.069171   0.767088  -1.280213   2.300581   1.159267  -0.078014   0.276016  -2.334891  -0.057621   0.973769   2.137541   1.358306   1.016821   0.801526  -0.475329  -0.979114   0.257669  -0.651071  -0.431011   