In [1]:
%load_ext autoreload

%autoreload 2

In [2]:
import os, sys
sys.path.append('..')

from scripts.modeling.mlflow_config import *
from scripts.modeling.model_factory import *
from scripts.modeling.model_trainer import *
from scripts.modeling.data_preparation import *
from scripts.data_utils.loaders import load_data

DATAPATH = os.path.join('..', 'resources', 'data')

In [3]:
# Load datasets
fraud_data_path = os.path.join(DATAPATH, 'processed', "fraud_processed.csv")
credit_card_path = os.path.join(DATAPATH, "creditcard.csv")

fraud_df = load_data(fraud_data_path)
credit_df = load_data(credit_card_path)
fraud_df.info(), credit_df.info()

2025-02-19 11:38:35 - [✅ INFO] - Loading data from ..\resources\data\processed\fraud_processed.csv
2025-02-19 11:38:36 - [✅ INFO] - Successfully loaded data from ..\resources\data\processed\fraud_processed.csv
2025-02-19 11:38:36 - [✅ INFO] - Loading data from ..\resources\data\creditcard.csv
2025-02-19 11:38:40 - [✅ INFO] - Successfully loaded data from ..\resources\data\creditcard.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 18 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   user_id                   151112 non-null  int64  
 1   signup_time               151112 non-null  object 
 2   purchase_time             151112 non-null  object 
 3   purchase_value            151112 non-null  float64
 4   device_id                 151112 non-null  object 
 5   source                    151112 non-null  int64  
 6   browser                   151112 non-null  int

(None, None)

In [4]:
fraud_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 18 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   user_id                   151112 non-null  int64  
 1   signup_time               151112 non-null  object 
 2   purchase_time             151112 non-null  object 
 3   purchase_value            151112 non-null  float64
 4   device_id                 151112 non-null  object 
 5   source                    151112 non-null  int64  
 6   browser                   151112 non-null  int64  
 7   sex                       151112 non-null  int64  
 8   age                       151112 non-null  int64  
 9   ip_address                151112 non-null  int64  
 10  class                     151112 non-null  int64  
 11  country                   151112 non-null  int64  
 12  hour_of_day               151112 non-null  float64
 13  day_of_week               151112 non-null  f

In [5]:
fraud_target = 'class'
drop_columns = ['user_id', 'signup_time', 'purchase_time', 'device_id', 'ip_address']
X_fraud, y_fraud = preprocess_data(fraud_df, fraud_target, drop_columns)
X_fraud, y_fraud

(        purchase_value  source  browser  sex  age  country  hour_of_day  \
 0             0.310345       2        0    0   25      171     0.434783   
 1             0.220690       1        4    0   38      157     0.913043   
 2             0.262069       2        0    1   25      171     0.478261   
 3             0.365517       2        1    1   21      172     1.000000   
 4             0.179310       0        4    1   19      107     0.869565   
 ...                ...     ...      ...  ...  ...      ...          ...   
 151107        0.503448       1        4    1   59       60     0.304348   
 151108        0.075862       1        2    0   37       64     0.304348   
 151109        0.165517       1        1    0   35       84     1.000000   
 151110        0.393103       1        0    1   48      171     0.869565   
 151111        0.337931       0        0    1   25      171     0.260870   
 
         day_of_week  time_diff  transaction_velocity  \
 0          0.833333        0

In [6]:
credit_target = 'Class'
X_credit, y_credit = preprocess_data(credit_df, credit_target)
X_credit, y_credit

(            Time         V1         V2        V3        V4        V5  \
 0            0.0  -1.359807  -0.072781  2.536347  1.378155 -0.338321   
 1            0.0   1.191857   0.266151  0.166480  0.448154  0.060018   
 2            1.0  -1.358354  -1.340163  1.773209  0.379780 -0.503198   
 3            1.0  -0.966272  -0.185226  1.792993 -0.863291 -0.010309   
 4            2.0  -1.158233   0.877737  1.548718  0.403034 -0.407193   
 ...          ...        ...        ...       ...       ...       ...   
 284802  172786.0 -11.881118  10.071785 -9.834783 -2.066656 -5.364473   
 284803  172787.0  -0.732789  -0.055080  2.035030 -0.738589  0.868229   
 284804  172788.0   1.919565  -0.301254 -3.249640 -0.557828  2.630515   
 284805  172788.0  -0.240440   0.530483  0.702510  0.689799 -0.377961   
 284806  172792.0  -0.533413  -0.189733  0.703337 -0.506271 -0.012546   
 
               V6        V7        V8        V9  ...       V20       V21  \
 0       0.462388  0.239599  0.098698  0.36378

In [7]:
X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test = stratified_split(X_fraud, y_fraud)


In [8]:
X_credit_train, X_credit_test, y_credit_train, y_credit_test = stratified_split(X_credit, y_credit)


In [9]:
configure_mlflow()

2025-02-19 11:38:47 - [✅ INFO] - Context impl SQLiteImpl.
2025-02-19 11:38:47 - [✅ INFO] - Will assume non-transactional DDL.


In [10]:
fraud_trainer = ModelTrainer(X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test, 'fraud')
fraud_trainer

<scripts.modeling.model_trainer.ModelTrainer at 0x22d9702ac00>

In [12]:
# Traditional ML models
fraud_models = [
    ('LogisticRegression', ModelFactory.logistic_regression()),
    ('DecisionTree', DecisionTreeClassifier(class_weight='balanced')),
    ('RandomForest', ModelFactory.random_forest()),
    ('GradientBoosting', GradientBoostingClassifier(n_estimators=100))
]

for name, model in fraud_models:
    fraud_trainer.train_sklearn_model(model, name)




FileNotFoundError: [WinError 3] The system cannot find the path specified

In [13]:
# Neural Networks
fraud_nn_models = {
    'MLP': ModelFactory.mlp,
    'CNN': ModelFactory.cnn,
    'LSTM': ModelFactory.lstm
}

for name, builder in fraud_nn_models.items():
    fraud_trainer.train_keras_model(builder, name, epochs=5)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 7ms/step - accuracy: 0.5560 - loss: 0.8117 - val_accuracy: 0.9091 - val_loss: 0.4792
Epoch 2/5
[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.6912 - loss: 0.6313 - val_accuracy: 0.6063 - val_loss: 0.6708
Epoch 3/5
[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.7170 - loss: 0.6086 - val_accuracy: 0.3943 - val_loss: 0.8978
Epoch 4/5
[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.7072 - loss: 0.6113 - val_accuracy: 0.7190 - val_loss: 0.5944
Epoch 5/5
[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.7400 - loss: 0.5952 - val_accuracy: 0.7148 - val_loss: 0.5909
[1m945/945[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step




Epoch 1/5
[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 9ms/step - accuracy: 0.6040 - loss: 0.7094 - val_accuracy: 0.7469 - val_loss: 0.5900
Epoch 2/5
[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - accuracy: 0.7590 - loss: 0.5719 - val_accuracy: 0.9015 - val_loss: 0.3487
Epoch 3/5
[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.7966 - loss: 0.5559 - val_accuracy: 0.8829 - val_loss: 0.4889
Epoch 4/5
[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - accuracy: 0.8331 - loss: 0.5304 - val_accuracy: 0.6580 - val_loss: 0.6422
Epoch 5/5
[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 9ms/step - accuracy: 0.8271 - loss: 0.5337 - val_accuracy: 0.9248 - val_loss: 0.3394
[1m945/945[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step




New best model saved: CNN with ROC-AUC: 0.7612


FileNotFoundError: [WinError 3] The system cannot find the path specified

In [14]:
# Best model info
fraud_trainer.best_model_name, fraud_trainer.best_score

('CNN', 0.7612486760242454)

In [15]:
credit_trainer = ModelTrainer(X_credit_train, X_credit_test, y_credit_train, y_credit_test, 'credit')
credit_trainer

<scripts.modeling.model_trainer.ModelTrainer at 0x22daf511f70>

In [16]:
# Traditional ML models
credit_models = [
    ('LogisticRegression', ModelFactory.logistic_regression()),
    ('DecisionTree', DecisionTreeClassifier(class_weight='balanced')),
    ('RandomForest', ModelFactory.random_forest()),
    ('GradientBoosting', GradientBoostingClassifier(n_estimators=100))
]

for name, model in credit_models:
    credit_trainer.train_sklearn_model(model, name)
    

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


New best model saved: LogisticRegression with ROC-AUC: 0.9731


FileNotFoundError: [WinError 3] The system cannot find the path specified

In [None]:
# Neural Networks
credit_nn_models = {
    'MLP': ModelFactory.mlp,
    'CNN': ModelFactory.cnn,
    'LSTM': ModelFactory.lstm
}

for name, builder in credit_nn_models.items():
    credit_trainer.train_keras_model(builder, name, epochs=30)

In [None]:
# Best model info
credit_trainer.best_model_name, credit_trainer.best_score

Expainability

In [None]:
import joblib
import pandas as pd
from scripts.explainability.shap_explainer import SHAPExplainer
from scripts.explainability.lime_explainer import LIMEExplainer
from scripts.explainability.model_explainability import ModelExplainer

model_path = "models/best_model/best_model.pkl"
explainer = ModelExplainer(model=model_path, X_train=X_fraud_train, model_type="sklearn")# or "keras" if using neural network
# Generate full report
explainer.generate_report(X_fraud_test)


NameError: name 'ModelExplainer' is not defined

In [20]:
# Load trained model (replace with your actual model path)
model = joblib.load(model_path)

In [21]:
### SHAP Analysis ###
shap_explainer = SHAPExplainer(model, X_fraud_train)

In [22]:
# Plot SHAP Summary (Global Feature Importance)
shap_explainer.plot_summary()

Provided model function fails when applied to the provided data set.


ValueError: X has 12 features, but LogisticRegression is expecting 30 features as input.

In [23]:
# Plot SHAP Force Plot (Instance-Level Explanation)
shap_explainer.plot_force(instance_idx=5)

Provided model function fails when applied to the provided data set.


ValueError: X has 12 features, but LogisticRegression is expecting 30 features as input.

In [None]:
# Plot SHAP Dependence Plot for a specific feature
shap_explainer.plot_dependence("purchase_value")

In [None]:
### LIME Analysis ###
lime_explainer = LIMEExplainer(model, X_fraud_train)

In [None]:
# Explain a single instance
lime_explainer.plot_instance_explanation(instance_idx=10)