In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell executes, so edits to the project's .py files are
# picked up without restarting the kernel.
%load_ext autoreload

%autoreload 2

In [None]:
import os
import sys
import matplotlib

# Put the parent directory on the import path so the `scripts` package
# imported below can be found when the notebook runs from its own folder.
# Guarded so that re-running this cell does not append duplicate entries.
if '..' not in sys.path:
    sys.path.append('..')

from scripts.data_utils.loaders import load_data, save_data
from scripts.data_utils.cleaner import validate_convert_date_column
from scripts.modeling.model import prepare_for_modelling, CreditScoringModel

# No explicit matplotlib.use(...) call here: forcing the GUI 'TkAgg' backend
# can raise ImportError on machines without Tk or a display, and the
# `%matplotlib inline` magic in the following cell selects the backend this
# notebook actually renders with.

In [3]:
# Select the inline backend so figures are rendered in the cell outputs.
%matplotlib inline
# Directory layout, expressed relative to the notebook's working directory.
# os.pardir is the platform's parent-directory token ('..').
RESOURCEPATH = os.path.join(os.pardir, 'resources')
# Data files live under the resources folder ...
DATAPATH = os.path.join(RESOURCEPATH, 'data')
# ... and this sub-folder is where the processed CSV read below is located.
processed_output_dir = os.path.join(DATAPATH, 'processed')

In [4]:
# Input file (inside the processed-data directory) and the name of its
# timestamp column.
filename = 'data_rfms_classified.csv'
file_path = os.path.join(processed_output_dir, filename)
date_column = 'TransactionStartTime'

# Load the CSV, then pass the frame through validate_convert_date_column for
# the timestamp column; the result is what later cells use as `data`.
data = validate_convert_date_column(load_data(file_path), date_column)

# Print the column / dtype summary of the loaded frame.
data.info()

2025-01-29 07:37:47 - INFO - Loading data from ..\resources\data\processed\data_rfms_classified.csv
2025-01-29 07:37:47 - INFO - Loading data from ..\resources\data\processed\data_rfms_classified.csv
2025-01-29 07:37:47 - INFO - Loading data from ..\resources\data\processed\data_rfms_classified.csv
2025-01-29 07:37:47 - INFO - Loading data from ..\resources\data\processed\data_rfms_classified.csv
2025-01-29 07:37:47 - INFO - Loading data from ..\resources\data\processed\data_rfms_classified.csv
2025-01-29 07:37:47 - INFO - Loading data from ..\resources\data\processed\data_rfms_classified.csv
2025-01-29 07:37:47 - INFO - Loading data from ..\resources\data\processed\data_rfms_classified.csv
2025-01-29 07:37:50 - INFO - Successfully loaded data from ..\resources\data\processed\data_rfms_classified.csv
2025-01-29 07:37:50 - INFO - Successfully loaded data from ..\resources\data\processed\data_rfms_classified.csv
2025-01-29 07:37:50 - INFO - Successfully loaded data from ..\resources\data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95661 entries, 0 to 95660
Data columns (total 30 columns):
 #   Column                    Non-Null Count  Dtype                             
---  ------                    --------------  -----                             
 0   TransactionId             95661 non-null  object                            
 1   BatchId                   95661 non-null  object                            
 2   AccountId                 95661 non-null  object                            
 3   SubscriptionId            95661 non-null  object                            
 4   CustomerId                95661 non-null  object                            
 5   CurrencyCode              95661 non-null  float64                           
 6   CountryCode               95661 non-null  float64                           
 7   ProviderId                95661 non-null  float64                           
 8   ProductId                 95661 non-null  float64                 

In [5]:
# Column roles used from here on.
target_column = 'FraudResult'   # consumed by the model cell further down
cluster_label = 'RFMS_Label'
# Identifier and timestamp columns handed to prepare_for_modelling; presumably
# these are removed there (the recorded info() outputs go from 30 columns
# before this step to 24 after it).
drop_columns = [
    'TransactionStartTime',
    'TransactionId',
    'BatchId',
    'AccountId',
    'SubscriptionId',
    'CustomerId',
]

data_ready = prepare_for_modelling(data, cluster_label, drop_columns)

# Print the column / dtype summary of the modelling-ready frame.
data_ready.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95661 entries, 0 to 95660
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   CurrencyCode              95661 non-null  float64
 1   CountryCode               95661 non-null  float64
 2   ProviderId                95661 non-null  float64
 3   ProductId                 95661 non-null  float64
 4   ProductCategory           95661 non-null  float64
 5   ChannelId                 95661 non-null  float64
 6   Amount                    95661 non-null  float64
 7   Value                     95661 non-null  float64
 8   PricingStrategy           95661 non-null  float64
 9   FraudResult               95661 non-null  int64  
 10  total_transaction_amount  95661 non-null  float64
 11  avg_transaction_amount    95661 non-null  float64
 12  transaction_count         95661 non-null  float64
 13  std_transaction_amount    95661 non-null  float64
 14  transa

In [None]:
# NOTE(review): disabled experiment, kept for reference only. As written it
# would not run if uncommented: `data_final` and `pd` are not defined or
# imported in any cell visible in this notebook — TODO remove or update.
# # Split data to train and test
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(data_final.drop(columns=[target_column]), data_final[target_column], test_size=0.2)
# data_training = pd.concat([X_train.reset_index(drop=True), y_train.reset_index(drop=True)], axis=1)
# data_training

In [7]:
# Build the scoring model around the prepared frame, naming the target column.
credit_model = CreditScoringModel(
    data_ready,
    target_column=target_column,
)

# Run the whole pipeline. Going by the recorded log it preprocesses the data,
# creates train/validation splits and trains the candidate models. It returns
# the selected model together with the test data, which the next cell unpacks.
best_credit_scoring_model, test_data = credit_model.run_pipeline()

# Display the selected model as the cell's output.
best_credit_scoring_model

2025-01-29 07:38:50 - INFO - Preprocessing data...
2025-01-29 07:38:50 - INFO - Preprocessing data...
2025-01-29 07:38:50 - INFO - Preprocessing data...
2025-01-29 07:38:50 - INFO - Preprocessing data...
2025-01-29 07:38:50 - INFO - Preprocessing data...
2025-01-29 07:38:50 - INFO - Preprocessing data...
2025-01-29 07:38:50 - INFO - Preprocessing data...
2025-01-29 07:38:51 - INFO - Train shape: (68875, 23), Validation shape: (17219, 23)
2025-01-29 07:38:51 - INFO - Train shape: (68875, 23), Validation shape: (17219, 23)
2025-01-29 07:38:51 - INFO - Train shape: (68875, 23), Validation shape: (17219, 23)
2025-01-29 07:38:51 - INFO - Train shape: (68875, 23), Validation shape: (17219, 23)
2025-01-29 07:38:51 - INFO - Train shape: (68875, 23), Validation shape: (17219, 23)
2025-01-29 07:38:51 - INFO - Train shape: (68875, 23), Validation shape: (17219, 23)
2025-01-29 07:38:51 - INFO - Train shape: (68875, 23), Validation shape: (17219, 23)
2025-01-29 07:38:51 - INFO - Training models...


In [8]:
# run_pipeline() returned the test data as a pair; unpack it into the two
# pieces evaluate_model expects.
X_test, y_test = test_data

# Score the selected model on that test data.
# NOTE(review): in the recorded run accuracy is ~0.998 while recall is 0.3, so
# accuracy alone is not a good summary of this model — read precision, recall
# and F1 alongside it.
metrics = credit_model.evaluate_model(
    best_credit_scoring_model,
    X_test,
    y_test,
)

# Display the metrics dict as the cell's output.
metrics

{'accuracy': 0.9982230584300199,
 'precision': 0.6666666666666666,
 'recall': 0.3,
 'f1_score': 0.41379310344827586,
 'roc_auc': 0.9713522572535875}