In [2]:
import pandas as pd

This notebook investigates the transformations needed to build a baseline model as a starting point. From there, we will improve the model by applying different techniques and feature engineering.

In [3]:
# Read the fraud-detection dataset from the local data folder and
# show the resulting dataframe as the cell output.
fraud_df = pd.read_csv(filepath_or_buffer="data/fraud_detection_dataset.csv")
fraud_df

Unnamed: 0,day_of_month,type,amount_log,amount_dest_log,is_fraud
0,1,PAYMENT,9.194276,0.000000,0
1,1,PAYMENT,7.531166,0.000000,0
2,1,TRANSFER,5.204007,0.000000,1
3,1,CASH_OUT,5.204007,9.960954,1
4,1,PAYMENT,9.364703,0.000000,0
...,...,...,...,...,...
6362615,31,CASH_OUT,12.735768,12.735768,1
6362616,31,TRANSFER,15.657870,0.000000,1
6362617,31,CASH_OUT,15.657870,15.657870,1
6362618,31,TRANSFER,13.652996,0.000000,1


# Pre-processing Pipeline

- dropping columns
- filter out the categories PAYMENT, DEBIT and CASH_IN (*automatically tagged as not fraud*)
- training and testing split
  - It is necessary to keep the proportions of the target and of the categorical variables in both splits
- label_encoder on type
- scale on numerical variables

In [3]:
from src.utils_preprocessing import PreProcessingPipe

In [4]:
# Build the pre-processing pipeline on top of the loaded fraud dataframe.
pre_processing_pipe = PreProcessingPipe(dataset=fraud_df)

# Columns handed to the pipeline's `drop_columns` step.
columns_to_drop = ["isFlaggedFraud", "step", "nameOrig", "nameDest"]
pre_processing_pipe.drop_columns(columns=columns_to_drop)

# Transaction types handed to `filter_type_classes`; the notebook's plan
# describes these as the categories automatically tagged as not fraud.
# NOTE(review): presumably rows of these types are removed — confirm in
# src.utils_preprocessing.
type_classes_to_filter = ["PAYMENT", "CASH_IN", "DEBIT"]
pre_processing_pipe.filter_type_classes(classes=type_classes_to_filter)

# Value passed as `sample_test_size` to the train/test splitting step.
test_sample_size = 0.40
pre_processing_pipe.train_test_splitting(sample_test_size=test_sample_size)

In [5]:
# Run the pipeline's label-encoding step (the notebook's plan applies it to
# the `type` column), then display the training features as the cell output.
# NOTE(review): the list printed by this cell presumably comes from
# `label_encoding` itself — confirm in src.utils_preprocessing.
pre_processing_pipe.label_encoding()
pre_processing_pipe.X_train

['CASH_OUT', 'TRANSFER']


Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest
4421351,1,3186916.14,383749.41,0.00,6082045.90,9268962.03
1677808,1,745013.00,0.00,0.00,4038650.74,4783663.74
454815,1,320697.68,102278.00,0.00,0.00,320697.68
694230,0,378854.34,51582.00,0.00,65230.97,444085.31
113562,0,367970.23,53708.00,0.00,258958.75,703945.69
...,...,...,...,...,...,...
6332630,0,3453.76,47105.00,43651.24,0.00,3453.76
1102634,0,391481.34,0.00,0.00,2490303.90,2881785.24
5698907,0,261398.34,9981.00,0.00,9771799.74,10033198.08
441011,0,104153.26,0.00,0.00,679988.08,784141.34


In [6]:
# Run the pipeline's scaling step; the notebook's plan targets the numerical
# variables with it.
# NOTE(review): the exact scaler is defined in src.utils_preprocessing and is
# not visible here — confirm which one is used.
pre_processing_pipe.scaling()

In [7]:
# Display the training features again to inspect them after the scaling step.
pre_processing_pipe.X_train

Unnamed: 0,type,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest
0,1.0,0.043169,0.006440,0.00000,0.017084,0.026023
1,1.0,0.010092,0.000000,0.00000,0.011344,0.013430
2,1.0,0.004344,0.001717,0.00000,0.000000,0.000900
3,0.0,0.005132,0.000866,0.00000,0.000183,0.001247
4,0.0,0.004984,0.000901,0.00000,0.000727,0.001976
...,...,...,...,...,...,...
1662240,0.0,0.000047,0.000791,0.00088,0.000000,0.000010
1662241,0.0,0.005303,0.000000,0.00000,0.006995,0.008091
1662242,0.0,0.003541,0.000168,0.00000,0.027448,0.028169
1662243,0.0,0.001411,0.000000,0.00000,0.001910,0.002202


# Training

In [8]:
from src.utils_preprocessing import Training

# Collect the four train/test splits exposed by the pre-processing pipeline
# and pass them to the Training helper under the same keyword names.
training_pipe = Training(
    **{
        split: getattr(pre_processing_pipe, split)
        for split in ("X_train", "X_test", "y_train", "y_test")
    }
)

# Fit the logistic regression, generate its predictions, then gather the
# evaluation metrics.
training_pipe.fit_logistic_regression()
training_pipe.predict_logistic_regression()
metrics = training_pipe.calculate_metrics()

# NOTE(review): the recorded output shows a `column_or_1d` warning and the
# accuracy values wrapped in 1-tuples; both presumably originate inside
# `Training` — verify in src.utils_preprocessing.
metrics


  y = column_or_1d(y, warn=True)


{'training': {'accuracy': (0.9972657460242023,),
  'recall': 0.08157467532467533,
  'precision': 0.9548693586698337,
  'auc': 0.5407816055060876},
 'testing': {'accuracy': (0.9972720644236773,),
  'recall': 0.08280060882800609,
  'precision': 0.9645390070921985,
  'auc': 0.5413957790315855}}

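This is a very poor model: with an AUC of 0.54, it is barely better than randomly guessing whether a transaction is fraud or not. The high accuracy (about 99.7%) is misleading, because recall is only about 8%, i.e. most fraudulent transactions are missed. However, the idea was only to build a first baseline model, in order to follow the process from start to finish with the API.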

In [9]:
import pickle

In [11]:
from pathlib import Path

# Persist the fitted baseline logistic-regression model to disk.
model_file_name = "models/lrc_baseline.sav"

# Make sure the target directory exists before writing, so this cell does not
# depend on a separate directory-creation cell having been run first.
Path(model_file_name).parent.mkdir(parents=True, exist_ok=True)

# Use a context manager so the file handle is flushed and closed
# deterministically, even if pickling raises.
with open(model_file_name, "wb") as model_file:
    pickle.dump(training_pipe.lrc, model_file)

In [10]:
from pathlib import Path

# Create the output directory for saved models. `exist_ok=True` keeps the cell
# idempotent, so re-running the notebook does not fail when the directory is
# already there, and pathlib avoids relying on a platform-specific shell command.
Path("models").mkdir(parents=True, exist_ok=True)