Import Necessary Modules

In [23]:
import sys, os
sys.path.append(os.path.abspath('..'))
import pandas as pd
import numpy as np

from src.feature_engineering import feature_engineering, split_data, baseline_model, ensemble_model, model_evaluator
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score



## Fraud Data

Preparing Data

In [24]:
# Load the pre-processed fraud transactions and print a schema summary
# (column names, non-null counts, dtypes) as a quick sanity check.
FRAUD_CSV_PATH = '../data/processed/fraud_processed.csv'
fraud = pd.read_csv(FRAUD_CSV_PATH)
fraud.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 129146 entries, 0 to 129145
Data columns (total 24 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   user_id                 129146 non-null  float64
 1   signup_time             129146 non-null  object 
 2   purchase_time           129146 non-null  object 
 3   purchase_value          129146 non-null  float64
 4   device_id               129146 non-null  object 
 5   age                     129146 non-null  float64
 6   ip_address              129146 non-null  float64
 7   class                   129146 non-null  int64  
 8   lower_bound_ip_address  129146 non-null  float64
 9   upper_bound_ip_address  129146 non-null  float64
 10  country                 129146 non-null  object 
 11  time_since_signup       129146 non-null  float64
 12  hour_of_day             129146 non-null  int64  
 13  day_of_week             129146 non-null  int64  
 14  txn_count_1h        

In [26]:
# feature_engineering / split_data are project helpers from src.feature_engineering.
# The first is given the frame and the name of the 'class' column and its result is
# unpacked as (features, target); the second yields the four train/test pieces.
# NOTE(review): any scaling/encoding/resampling done inside the helpers is not
# visible from this notebook - confirm in the module.
X_fraud, y_fraud = feature_engineering(fraud, 'class')
fraud_splits = split_data(X_fraud, y_fraud)
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = fraud_splits


Building a Baseline model using Logistic Regression

In [27]:
# Fit the baseline on the fraud training split only. baseline_model is a project
# helper; the evaluation output further down reports the returned estimator as a
# LogisticRegression.
log_reg_fraud = baseline_model(X_train_fraud, y_train_fraud)




In [28]:
# Score the logistic-regression baseline on the fraud test split. The helper reports
# the model name, F1-score, AUC-PR and confusion matrix (see the output below).
model_evaluator(log_reg_fraud, X_test_fraud, y_test_fraud)

Model Name:  LogisticRegression
F1-score:  0.28102630097321646
AUC-PR:  0.411683695578497
Confusion Matric: 
 [[15144  8232]
 [  707  1747]]


Building Ensemble Model

In [29]:
# Train the ensemble model on the fraud training split. Note that the estimator
# *class* (not an instance) is handed to the project helper.
# NOTE(review): presumably ensemble_model instantiates and fits it - confirm which
# hyperparameters / random_state it uses in src.feature_engineering.
rf_fraud = ensemble_model(RandomForestClassifier, X_train_fraud, y_train_fraud)

In [30]:
# Score the random forest on the same fraud test split used for the baseline, so the
# reported F1-score, AUC-PR and confusion matrix are directly comparable.
model_evaluator(rf_fraud, X_test_fraud, y_test_fraud)

Model Name:  RandomForestClassifier
F1-score:  0.6232827187274042
AUC-PR:  0.7190105912021924
Confusion Matric: 
 [[22022  1354]
 [  730  1724]]


Cross Validation

In [31]:
# 5-fold stratified cross-validation of the random forest on the full (pre-split)
# fraud features, scored with average precision and reported as AUC-PR.
# Folds are shuffled with a fixed seed so the fold assignment is repeatable.
# cross_val_score fits a fresh clone of the estimator on each fold, so the
# already-fitted rf_fraud only supplies the model configuration here.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fraud_cv_settings = {'cv': skf, 'scoring': 'average_precision', 'n_jobs': -1}
auc_pr_cv = cross_val_score(rf_fraud, X_fraud, y_fraud, **fraud_cv_settings)

# Summarise the per-fold scores: mean for the level, std for fold-to-fold stability.
fraud_cv_mean, fraud_cv_std = auc_pr_cv.mean(), auc_pr_cv.std()
print("Mean AUC-PR:", fraud_cv_mean)
print("Std AUC-PR:", fraud_cv_std)

Mean AUC-PR: 0.711155890390075
Std AUC-PR: 0.005682828041693299


## Credit Card Data

Data Preparation

In [32]:
# Load the raw credit-card dataset, then run it through the same project helpers as
# the fraud data: feature_engineering is given the frame and the 'Class' column name
# and its result is unpacked as (features, target); split_data yields the four
# train/test pieces used by the cells below.
CREDIT_CSV_PATH = '../data/raw/creditcard.csv'
credit = pd.read_csv(CREDIT_CSV_PATH)
X_credit, y_credit = feature_engineering(credit, 'Class')
credit_splits = split_data(X_credit, y_credit)
X_train_credit, X_test_credit, y_train_credit, y_test_credit = credit_splits


Building a Baseline model using Logistic Regression

In [34]:
# Fit the baseline on the credit-card training split. The evaluation output further
# down reports the returned estimator as a LogisticRegression.
# NOTE(review): the captured output shows the solver stopped at its iteration limit
# (convergence warning). Any fix - scaling the features or raising max_iter - would
# have to be made inside baseline_model, which is not visible from this notebook.
log_reg_credit = baseline_model(X_train_credit, y_train_credit)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Score the logistic-regression baseline on the credit-card test split. The helper
# reports the model name, F1-score, AUC-PR and confusion matrix (see the output below).
model_evaluator(log_reg_credit, X_test_credit, y_test_credit)

Model Name:  LogisticRegression
F1-score:  0.10416666666666667
AUC-PR:  0.7124702274505191
Confusion Matric: 
 [[55324  1540]
 [    8    90]]


Building Ensemble Model

In [35]:
# Train the ensemble model on the credit-card training split. As with the fraud
# data, the estimator *class* (not an instance) is handed to the project helper.
# NOTE(review): presumably ensemble_model instantiates and fits it - confirm which
# hyperparameters / random_state it uses in src.feature_engineering.
rf_credit = ensemble_model(RandomForestClassifier, X_train_credit, y_train_credit)

In [36]:
# Score the random forest on the same credit-card test split used for the baseline,
# so the reported F1-score, AUC-PR and confusion matrix are directly comparable.
model_evaluator(rf_credit, X_test_credit, y_test_credit)

Model Name:  RandomForestClassifier
F1-score:  0.8121827411167513
AUC-PR:  0.8204176082674941
Confusion Matric: 
 [[56845    19]
 [   18    80]]


Cross Validation

In [37]:
# 5-fold stratified cross-validation of the random forest on the full (pre-split)
# credit-card features, scored with average precision and reported as AUC-PR.
# Folds are shuffled with a fixed seed so the fold assignment is repeatable.
# cross_val_score fits a fresh clone of the estimator on each fold, so the
# already-fitted rf_credit only supplies the model configuration here.
# Note: skf and auc_pr_cv are rebound here and replace the fraud-data values.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
credit_cv_settings = {'cv': skf, 'scoring': 'average_precision', 'n_jobs': -1}
auc_pr_cv = cross_val_score(rf_credit, X_credit, y_credit, **credit_cv_settings)

# Summarise the per-fold scores: mean for the level, std for fold-to-fold stability.
credit_cv_mean, credit_cv_std = auc_pr_cv.mean(), auc_pr_cv.std()
print("Mean AUC-PR:", credit_cv_mean)
print("Std AUC-PR:", credit_cv_std)

Mean AUC-PR: 0.823651925601687
Std AUC-PR: 0.027092036325574484


Model Comparison and Selection

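Based on the evaluation of the two models (Logistic Regression and Random Forest), Random Forest is the better model on both datasets. It achieves a higher AUC-PR (about 0.72 vs. 0.41 on the fraud data and 0.82 vs. 0.71 on the credit card data), which indicates a better overall balance between precision and recall; the cross-validated AUC-PR (about 0.71 and 0.82 respectively) is in line with these test-set results. The confusion matrices also show far fewer false positives for Random Forest (1,354 vs. 8,232 on the fraud data and 19 vs. 1,540 on the credit card data). Logistic Regression identifies slightly more of the positive cases (1,747 vs. 1,724 and 90 vs. 80 true positives), but only at the cost of many more false alarms. Together, these results suggest that the Random Forest model captures complex patterns better than the Logistic Regression model.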