In [1]:
import pandas as pd

This notebook investigates the transformations needed to build a baseline model as a starting point. From there we will improve the model by applying different techniques and feature engineering.

In [2]:
# Load the CSV (by its file name, the output of the second EDA pass) and cast
# `day_of_month` to string in the same expression, so that the column can be
# treated as a categorical label when it is one-hot encoded later on.
fraud_df = pd.read_csv("data/second-eda-output.csv").astype({"day_of_month": str})

# Pre-processing Pipeline

- dropping columns
- filter categories: PAYMENT, DEBIT and CASH_IN (*automatically tagged as not fraud*)
- training and testing split
  - It is necessary to keep the proportions of the target and of the categorical variables
- encode `type` (the code below one-hot encodes it together with `day_of_month`)
- scale on numerical variables

In [3]:
from src.utils_preprocessing import PreProcessingPipe

In [4]:
# Wrap the loaded frame in the project's pre-processing helper and run its
# train/test split.
# NOTE(review): `sample_test_size=0.40` presumably reserves 40% of the rows for
# testing, and `to_drop` presumably names the target column to separate from
# the features — confirm against PreProcessingPipe.train_test_splitting.
pre_processing_pipe = PreProcessingPipe(dataset=fraud_df)
pre_processing_pipe.train_test_splitting(
    to_drop=["is_fraud"],
    sample_test_size=0.40,
)

In [5]:
# Columns handed to the pipeline's one-hot encoder.
categorical_columns = ["day_of_month", "type"]
pre_processing_pipe.one_hot_encoder(categorical_columns)

# Last expression of the cell: render the test features after encoding.
pre_processing_pipe.X_test

Unnamed: 0,amount_log,amount_dest_log,day_of_month_7,day_of_month_9,day_of_month_16,day_of_month_11,day_of_month_25,day_of_month_1,day_of_month_15,day_of_month_17,...,day_of_month_5,day_of_month_30,day_of_month_19,day_of_month_27,day_of_month_31,type_CASH_OUT,type_CASH_IN,type_PAYMENT,type_DEBIT,type_TRANSFER
6351923,12.468406,9.077967,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
4958954,12.477169,12.477169,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
2565225,13.062532,13.062532,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3456717,8.666616,0.000000,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1424294,11.512606,11.979958,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5699460,6.839316,0.000000,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
5514245,10.976880,10.976880,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6082646,8.717278,8.717278,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
23668,12.406850,9.902387,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0


In [6]:
# Re-display the encoded test features. Nothing has been modified since the
# previous cell, so this output repeats the one shown there.
pre_processing_pipe.X_test

Unnamed: 0,amount_log,amount_dest_log,day_of_month_7,day_of_month_9,day_of_month_16,day_of_month_11,day_of_month_25,day_of_month_1,day_of_month_15,day_of_month_17,...,day_of_month_5,day_of_month_30,day_of_month_19,day_of_month_27,day_of_month_31,type_CASH_OUT,type_CASH_IN,type_PAYMENT,type_DEBIT,type_TRANSFER
6351923,12.468406,9.077967,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
4958954,12.477169,12.477169,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
2565225,13.062532,13.062532,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3456717,8.666616,0.000000,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1424294,11.512606,11.979958,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5699460,6.839316,0.000000,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
5514245,10.976880,10.976880,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6082646,8.717278,8.717278,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
23668,12.406850,9.902387,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0


# Training

In [7]:
from src.utils_preprocessing import Training

# Hand the four train/test partitions produced by the pre-processing step to
# the Training helper, passing each one under the keyword of the same name.
partition_names = ("X_train", "X_test", "y_train", "y_test")
training_pipe = Training(
    **{name: getattr(pre_processing_pipe, name) for name in partition_names}
)

# Fit the logistic-regression baseline, run its prediction step, and collect
# the evaluation metrics returned by the helper.
training_pipe.fit_logistic_regression()
training_pipe.predict_logistic_regression()
metrics = training_pipe.calculate_metrics()

# Last expression of the cell, so the metrics are rendered as its output.
metrics


  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'training': {'accuracy': (0.9991882274911907,),
  'recall': 0.45474837662337664,
  'precision': 0.8447041085563513,
  'auc': 0.727320157565571},
 'testing': {'accuracy': (0.999178797413644,),
  'recall': 0.4465753424657534,
  'precision': 0.843588269120184,
  'auc': 0.7232341650641269}}

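This is a weak model: the test AUC is about 0.72 and the recall is only about 0.45, so it misses more than half of the fraudulent transactions. It is better than random guessing (AUC 0.5), but the very high accuracy is misleading, since a recall this low alongside ~99.9% accuracy means fraud cases are rare. However, the idea was just to build a first baseline model, in order to follow the process from start to finish with the API.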

In [8]:
import pickle

In [9]:
# Persist `training_pipe.lrc` (presumably the fitted logistic-regression
# classifier — confirm against the Training class) to disk with pickle.
# The file is opened in a `with` block so the handle is flushed and closed
# deterministically, instead of being left open until garbage collection.
# NOTE: only unpickle this file from a trusted source; `pickle.load` can
# execute arbitrary code.
model_file_name = "models/lrc_baseline.sav"
with open(model_file_name, "wb") as model_file:
    pickle.dump(training_pipe.lrc, model_file)