In [1]:
import sys
import os

# Make the parent of the current working directory importable so that the
# `src.*` imports in the following cells resolve.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Guard the append: re-running this cell must not stack duplicate entries
# on sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
from pathlib import Path
from src.data import data_handle
from src.features.features import FraudFeatureEgineering
from src.models.fraud_mode import *
from src.utils import split_train_test_val_set

In [3]:
# Project root: the first parent of the current working directory.
ROOT_DIR = Path.cwd().parents[0]
# Location of the raw dataset, built with os.path.join so it stays a plain string.
data_path = os.path.join(
    ROOT_DIR,
    'data/raw/credit_card.parquet',
)

In [4]:
# Load the raw dataset from `data_path` through the project's data_handle helper.
# NOTE(review): presumably returns a DataFrame (it is passed as `df=` to the split below) — confirm in src/data.
data = data_handle.load_data(data_path)

In [5]:
# Split the raw data into train / test / validation subsets.
# NOTE(review): the shapes printed in the next cell suggest `val_size=0.5` halves
# the 20% hold-out between test and validation — confirm in src/utils.
raw_split_kwargs = dict(df=data, train_size=0.8, val_size=0.5)
train_set, test_set, val_set = split_train_test_val_set(**raw_split_kwargs)

In [6]:
# Show (rows, columns) of each subset on one line, in train / test / validation order.
print(*(subset.shape for subset in (train_set, test_set, val_set)))

(227984, 35) (28457, 35) (28366, 35)


In [7]:
# Build the feature-engineering helper on the full raw dataset `data`
# (not on the train/test/validation subsets created above).
data_processor = FraudFeatureEgineering(data)

In [8]:
# Produce the engineered feature table; later cells save it, profile it,
# split it into train/test/validation sets, and score it.
process_data = data_processor.get_fraud_features()

In [9]:
# Persist the engineered feature table under data/processed.
data_process_path = os.path.join(
    ROOT_DIR,
    'data/processed/process_data.parquet',
)
# save_data takes the destination path first, then the object to write.
data_handle.save_data(
    data_process_path,
    process_data,
)

In [10]:
from src.visualization import plots

In [11]:
# Generate the exploratory-data-analysis HTML report for the raw dataset.
# The cell output shows the file being written under the project's reports/ directory.
plots.get_eda_html(file_name='raw_data_eda.html', data=data)

                                             |          | [  0%]   00:00 -> (? left)

Report /Users/kimalejandromoratrujillo/Documents/code/challenge_conekta/fraud_model/reports/raw_data_eda.html was generated.


In [12]:
# Generate the same HTML report for the engineered feature table so it can be
# compared with the raw-data report.
plots.get_eda_html(file_name='process_data_eda.html', data=process_data)

                                             |          | [  0%]   00:00 -> (? left)

Report /Users/kimalejandromoratrujillo/Documents/code/challenge_conekta/fraud_model/reports/process_data_eda.html was generated.


In [11]:
# Re-split, this time on the engineered features. This rebinds train_set,
# test_set and val_set, replacing the raw-data split made earlier.
train_set, test_set, val_set = split_train_test_val_set(
    df=process_data,
    train_size=0.8,
    val_size=0.5,
)

In [16]:
# Destination files for the three subsets, in train / test / validation order.
train_path, test_path, val_path = (
    os.path.join(ROOT_DIR, relative_path)
    for relative_path in (
        'data/processed/train_set.parquet',
        'data/processed/test_set.parquet',
        'data/processed/validation_set.parquet',
    )
)

# Write each subset to its own parquet file, in the same order as above.
for target_path, subset in (
    (train_path, train_set),
    (test_path, test_set),
    (val_path, val_set),
):
    data_handle.save_data(target_path, subset)

In [12]:
# Tune the 'lr' model on the training subset; the cell output shows the
# cross-validation folds and a "Successfully Saved" message.
create_tuned_model(
    train_set=train_set,
    model_type='lr',
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Class
2,Target type,Binary
3,Original data shape,"(227984, 47)"
4,Transformed data shape,"(386988, 47)"
5,Transformed train set shape,"(318592, 47)"
6,Transformed test set shape,"(68396, 47)"
7,Numeric features,38
8,Categorical features,2
9,Rows with missing values,73.5%


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9984,0.9946,0.9655,0.5283,0.6829,0.6822,0.7136
1,0.9985,0.9981,0.9655,0.549,0.7,0.6993,0.7275
2,0.9985,0.9913,0.8966,0.5532,0.6842,0.6835,0.7036
3,0.9985,0.9648,0.8966,0.5532,0.6842,0.6835,0.7036
4,0.998,0.9774,0.9655,0.4746,0.6364,0.6355,0.6762
5,0.9984,0.9866,0.8966,0.5306,0.6667,0.6659,0.689
6,0.9989,0.9971,0.9,0.6429,0.75,0.7495,0.7601
7,0.9982,0.9925,0.9333,0.5185,0.6667,0.6659,0.695
8,0.9976,0.989,0.9655,0.4242,0.5895,0.5884,0.6392
9,0.9984,0.9929,0.931,0.5294,0.675,0.6742,0.7014


Transformation Pipeline and Model Successfully Saved


In [13]:
# Tune the 'catboost' model on the training subset; the cell output shows the
# cross-validation folds and a "Successfully Saved" message.
create_tuned_model(
    train_set=train_set,
    model_type='catboost',
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Class
2,Target type,Binary
3,Original data shape,"(227984, 47)"
4,Transformed data shape,"(386988, 47)"
5,Transformed train set shape,"(318592, 47)"
6,Transformed test set shape,"(68396, 47)"
7,Numeric features,38
8,Categorical features,2
9,Rows with missing values,73.5%


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9998,0.9998,0.9655,0.9333,0.9492,0.9491,0.9492
1,0.9999,1.0,0.9655,1.0,0.9825,0.9824,0.9826
2,0.9997,0.9999,0.931,0.9,0.9153,0.9151,0.9152
3,0.9996,0.9773,0.931,0.871,0.9,0.8998,0.9003
4,0.9996,0.9935,0.8966,0.8667,0.8814,0.8811,0.8813
5,0.9996,0.9829,0.8966,0.8966,0.8966,0.8964,0.8964
6,0.9996,0.9872,0.8667,0.9286,0.8966,0.8964,0.8969
7,0.9998,0.9969,0.9667,0.9355,0.9508,0.9507,0.9509
8,0.9998,1.0,1.0,0.9062,0.9508,0.9507,0.9519
9,0.9998,0.98,0.8966,1.0,0.9455,0.9454,0.9468


Transformation Pipeline and Model Successfully Saved


In [14]:
# Tune the 'lightgbm' model on the training subset; the cell output shows the
# cross-validation folds and a "Successfully Saved" message.
create_tuned_model(
    train_set=train_set,
    model_type='lightgbm',
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Class
2,Target type,Binary
3,Original data shape,"(227984, 47)"
4,Transformed data shape,"(386988, 47)"
5,Transformed train set shape,"(318592, 47)"
6,Transformed test set shape,"(68396, 47)"
7,Numeric features,38
8,Categorical features,2
9,Rows with missing values,73.5%


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9997,0.9976,0.8966,0.9286,0.9123,0.9121,0.9123
1,0.9997,0.9999,0.931,0.931,0.931,0.9309,0.9309
2,0.9996,0.9996,0.8966,0.8966,0.8966,0.8964,0.8964
3,0.9997,0.9961,0.931,0.9,0.9153,0.9151,0.9152
4,0.9997,0.9781,0.8966,0.9286,0.9123,0.9121,0.9123
5,0.9994,0.9766,0.8276,0.8571,0.8421,0.8418,0.842
6,0.9997,0.9918,0.8333,1.0,0.9091,0.9089,0.9127
7,0.9997,0.9907,0.9667,0.9062,0.9355,0.9354,0.9358
8,0.9998,1.0,0.931,0.9643,0.9474,0.9473,0.9474
9,0.9997,0.9587,0.8276,1.0,0.9057,0.9055,0.9096


Transformation Pipeline and Model Successfully Saved


In [15]:
# Tune the 'xgboost' model on the training subset; the cell output shows the
# cross-validation folds and a "Successfully Saved" message.
create_tuned_model(
    train_set=train_set,
    model_type='xgboost',
)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Class
2,Target type,Binary
3,Original data shape,"(227984, 47)"
4,Transformed data shape,"(386988, 47)"
5,Transformed train set shape,"(318592, 47)"
6,Transformed test set shape,"(68396, 47)"
7,Numeric features,38
8,Categorical features,2
9,Rows with missing values,73.5%


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9997,0.9997,0.8966,0.9286,0.9123,0.9121,0.9123
1,0.9999,0.9999,0.931,1.0,0.9643,0.9642,0.9648
2,0.9996,0.9996,0.8966,0.8966,0.8966,0.8964,0.8964
3,0.9997,0.9987,0.9655,0.9032,0.9333,0.9332,0.9337
4,0.9996,0.9852,0.8966,0.8966,0.8966,0.8964,0.8964
5,0.9997,0.9732,0.8621,0.9615,0.9091,0.9089,0.9103
6,0.9997,0.9791,0.8333,1.0,0.9091,0.9089,0.9127
7,0.9999,0.9967,0.9667,0.9667,0.9667,0.9666,0.9666
8,0.9997,1.0,0.8966,0.963,0.9286,0.9284,0.929
9,0.9997,0.9955,0.8621,1.0,0.9259,0.9258,0.9284


Transformation Pipeline and Model Successfully Saved


In [18]:
# Reload the four saved pipelines; load_model is called in the order
# lr, xgboost, catboost, lightgbm.
lr_model, xgb_model, cat_model, lgbm_model = (
    load_model(model_name)
    for model_name in ('lr', 'xgboost', 'catboost', 'lightgbm')
)

Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded
Transformation Pipeline and Model Successfully Loaded


In [19]:
# Bare expression: display the object loaded under the 'lr' key so its
# pipeline steps can be inspected.
lr_model

In [33]:
# Score every row of the engineered table with the loaded XGBoost pipeline.
# The 'Class', 'timestamp' and 'credit_card_number' columns are removed before
# prediction; a KeyError here still signals that one of them is missing.
NON_FEATURE_COLUMNS = ['Class', 'timestamp', 'credit_card_number']
score_inputs = process_data.drop(columns=NON_FEATURE_COLUMNS)
# Re-running this cell would otherwise feed the previously written 'score'
# column back into the model as an extra input, so drop it when present.
score_inputs = score_inputs.drop(columns=['score'], errors='ignore')
# Column 1 of predict_proba is stored as the score.
# NOTE(review): presumably the positive (fraud) class probability — confirm the class order.
# NOTE(review): this scores all rows, including those used for training — confirm this is intended.
process_data['score'] = xgb_model.predict_proba(score_inputs)[:, 1]