In [1]:
%load_ext autoreload

%autoreload 2

In [2]:
import os, sys
sys.path.append('..')

from scripts.data_utils.loaders import load_data
from scripts.modeling.data_preparation import preprocess_data, stratified_split
from scripts.modeling.model_training import train_all_models
from scripts.modeling.experiment_tracking import log_experiment

DATAPATH = os.path.join('..', 'resources', 'data')

In [3]:
# Load datasets
fraud_data_path = os.path.join(DATAPATH, 'processed', "fraud_processed.csv")
credit_card_path = os.path.join(DATAPATH, "creditcard.csv")

fraud_df = load_data(fraud_data_path)
credit_df = load_data(credit_card_path)
fraud_df.info(), credit_df.info()

2025-02-17 15:06:30 - [✅ INFO] - Loading data from ..\resources\data\processed\fraud_processed.csv
2025-02-17 15:06:31 - [✅ INFO] - Successfully loaded data from ..\resources\data\processed\fraud_processed.csv
2025-02-17 15:06:31 - [✅ INFO] - Loading data from ..\resources\data\creditcard.csv
2025-02-17 15:06:35 - [✅ INFO] - Successfully loaded data from ..\resources\data\creditcard.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 18 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   user_id                   151112 non-null  int64  
 1   signup_time               151112 non-null  object 
 2   purchase_time             151112 non-null  object 
 3   purchase_value            151112 non-null  float64
 4   device_id                 151112 non-null  object 
 5   source                    151112 non-null  int64  
 6   browser                   151112 non-null  int

(None, None)

In [13]:
fraud_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 18 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   user_id                   151112 non-null  int64  
 1   signup_time               151112 non-null  object 
 2   purchase_time             151112 non-null  object 
 3   purchase_value            151112 non-null  float64
 4   device_id                 151112 non-null  object 
 5   source                    151112 non-null  int64  
 6   browser                   151112 non-null  int64  
 7   sex                       151112 non-null  int64  
 8   age                       151112 non-null  int64  
 9   ip_address                151112 non-null  int64  
 10  class                     151112 non-null  int64  
 11  country                   151112 non-null  int64  
 12  hour_of_day               151112 non-null  float64
 13  day_of_week               151112 non-null  f

(None,
 Index(['user_id', 'signup_time', 'purchase_time', 'purchase_value',
        'device_id', 'source', 'browser', 'sex', 'age', 'ip_address', 'class',
        'country', 'hour_of_day', 'day_of_week', 'time_diff',
        'transaction_velocity', 'signup_to_purchase_hours',
        'signup_delay_bucket'],
       dtype='object'))

In [21]:
fraud_target = 'class'
drop_columns = ['user_id', 'signup_time', 'purchase_time', 'device_id', 'ip_address']
X_fraud, y_fraud = preprocess_data(fraud_df, fraud_target, drop_columns)
X_fraud, y_fraud

(        purchase_value  source  browser  sex  age  country  hour_of_day  \
 0             0.310345       2        0    0   25      171     0.434783   
 1             0.220690       1        4    0   38      157     0.913043   
 2             0.262069       2        0    1   25      171     0.478261   
 3             0.365517       2        1    1   21      172     1.000000   
 4             0.179310       0        4    1   19      107     0.869565   
 ...                ...     ...      ...  ...  ...      ...          ...   
 151107        0.503448       1        4    1   59       60     0.304348   
 151108        0.075862       1        2    0   37       64     0.304348   
 151109        0.165517       1        1    0   35       84     1.000000   
 151110        0.393103       1        0    1   48      171     0.869565   
 151111        0.337931       0        0    1   25      171     0.260870   
 
         day_of_week  time_diff  transaction_velocity  \
 0          0.833333        0

In [23]:
credit_target = 'Class'
X_credit, y_credit = preprocess_data(credit_df, credit_target)
X_credit, y_credit

(            Time         V1         V2        V3        V4        V5  \
 0            0.0  -1.359807  -0.072781  2.536347  1.378155 -0.338321   
 1            0.0   1.191857   0.266151  0.166480  0.448154  0.060018   
 2            1.0  -1.358354  -1.340163  1.773209  0.379780 -0.503198   
 3            1.0  -0.966272  -0.185226  1.792993 -0.863291 -0.010309   
 4            2.0  -1.158233   0.877737  1.548718  0.403034 -0.407193   
 ...          ...        ...        ...       ...       ...       ...   
 284802  172786.0 -11.881118  10.071785 -9.834783 -2.066656 -5.364473   
 284803  172787.0  -0.732789  -0.055080  2.035030 -0.738589  0.868229   
 284804  172788.0   1.919565  -0.301254 -3.249640 -0.557828  2.630515   
 284805  172788.0  -0.240440   0.530483  0.702510  0.689799 -0.377961   
 284806  172792.0  -0.533413  -0.189733  0.703337 -0.506271 -0.012546   
 
               V6        V7        V8        V9  ...       V20       V21  \
 0       0.462388  0.239599  0.098698  0.36378

In [24]:
X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test = stratified_split(X_fraud, y_fraud)


In [25]:
X_credit_train, X_credit_test, y_credit_train, y_credit_test = stratified_split(X_credit, y_credit)


In [26]:
models = train_all_models(X_fraud_train, y_fraud_train, X_fraud_test, y_fraud_test)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



=== LogisticRegression ===
              precision    recall  f1-score   support

           0       0.91      0.98      0.94     27393
           1       0.03      0.00      0.01      2830

    accuracy                           0.89     30223
   macro avg       0.47      0.49      0.48     30223
weighted avg       0.82      0.89      0.86     30223

AUC-ROC: 0.7514

=== DecisionTree ===
              precision    recall  f1-score   support

           0       0.95      0.94      0.95     27393
           1       0.50      0.57      0.53      2830

    accuracy                           0.91     30223
   macro avg       0.73      0.76      0.74     30223
weighted avg       0.91      0.91      0.91     30223

AUC-ROC: 0.7553

=== RandomForest ===
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     27393
           1       1.00      0.54      0.70      2830

    accuracy                           0.96     30223
   macro avg       0.98 

In [27]:
for name, model in models.items():
    log_experiment(model, name, X_fraud_test, y_fraud_test)




Logged LogisticRegression to MLflow.




Logged DecisionTree to MLflow.




Logged RandomForest to MLflow.




Logged GradientBoosting to MLflow.
