In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads imported modules before each
# cell executes, so edits to the local `scripts` package take effect without a restart.
%load_ext autoreload

%autoreload 2

In [None]:
%matplotlib inline
import os
import sys
# Make the parent of the working directory importable so `scripts.*` resolves.
# Guarded so that re-running this cell does not keep appending duplicate entries.
if '..' not in sys.path:
    sys.path.append('..')

from scripts.data_utils.cleaner import *
from scripts.modeling.process import process_data
from scripts.data_utils.preprocess import preprocess_data
from scripts.data_utils.loaders import load_data, save_data
from scripts.modeling.orchestrator import *
from scripts.modeling.model import prepare_for_modeling, CreditScoringModel

# import matplotlib
# matplotlib.use('TkAgg')

In [3]:
# Input locations, expressed relative to the parent of the working directory.
RESOURCEPATH = os.path.join('..', 'resources')
DATAPATH = os.path.join(RESOURCEPATH, 'data')
raw_data_path = os.path.join(DATAPATH, 'raw')

# Output locations. The spelling of `prepreocessed_output_dir` is kept unchanged
# because the name is notebook-global.
prepreocessed_output_dir = os.path.join(DATAPATH, 'preprocessed')
processed_output_dir = os.path.join(DATAPATH, 'processed')
plot_output_dir = os.path.join('..', 'screenshots', 'plots')

# Create every output directory up front; directories that already exist are accepted.
for _output_dir in (prepreocessed_output_dir, processed_output_dir, plot_output_dir):
    os.makedirs(_output_dir, exist_ok=True)

In [None]:
# Load the CSV file from the raw-data directory and display its shape.
# NOTE(review): data_csv is not referenced by any later cell shown in this notebook;
# the rest of the workflow copies from data_xlsx.
filename = 'data.csv'
file_path = os.path.join(raw_data_path, filename)
data_csv = load_data(file_path)
data_csv.shape

In [None]:
# Load the 'data' sheet of the Excel workbook from the raw-data directory and display
# its shape. Later cells take their working copies from data_xlsx.
filename2 = 'data.xlsx'
file_path2 = os.path.join(raw_data_path, filename2)
data_xlsx = load_data(file_path2, sheet_name='data')
data_xlsx.shape

In [6]:
# Take a working copy so data_xlsx itself is not modified by later steps.
data = data_xlsx.copy()

# Preprocessing

In [7]:
# Column groups handed to preprocess_data.
irrelevant_columns = ['Unnamed: 16', 'Unnamed: 17']  # presumably spreadsheet artefacts — TODO confirm
numerical_columns = ['Amount', 'Value', 'PricingStrategy']
categorical_columns = ["CurrencyCode", "CountryCode", "ProviderId", "ProductId", "ProductCategory", "ChannelId"]
date_column = "TransactionStartTime"

# Imputation strategy to use for each column that may contain missing values.
missing_value_strategies = {
    "CountryCode": "most_frequent",
    "AccountId": "most_frequent",
    "ProviderId": "most_frequent",
    "PricingStrategy": "median",
    "Value": "mean",
}

# Target dtype per column. The literal previously listed "CountryCode" twice
# ("int64", then "str"); a dict keeps only the last value for a repeated key, so "str"
# was always the effective setting and is the single entry retained here.
dtype_conversions = {
    "CountryCode": "str",
}

# Start from a fresh copy of the raw Excel frame.
data = data_xlsx.copy()

# NOTE(review): the last argument is processed_output_dir, while prepreocessed_output_dir
# (created in the configuration cell) is not used anywhere in the visible notebook —
# confirm which directory the preprocessing output is meant to go to.
data_preprocessed = preprocess_data(
    data,
    irrelevant_columns,
    categorical_columns,
    numerical_columns,
    missing_value_strategies,
    date_column,
    dtype_conversions,
    processed_output_dir,
)
data_preprocessed

# Feature Engineering

In [68]:
# Feature groups used by the feature-engineering and modeling steps.
numerical_columns = ['Amount', 'Value', 'PricingStrategy']
rfms_features = ['Recency', 'Frequency', 'Monetary', 'Intensity', 'Volatility', 'Severity']
temporal_features = ['transaction_hour', 'transaction_day', 'transaction_month', 'transaction_year']
drop_columns = ['TransactionStartTime', 'TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId']
categorical_columns = ['CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId']
# 'transaction_count' is currently excluded from the aggregated features.
aggregated_features = ['total_transaction_amount', 'avg_transaction_amount', 'std_transaction_amount']

numerical_features = numerical_columns + aggregated_features
columns = categorical_columns + numerical_features
# `columns` already contains aggregated_features (through numerical_features), so they
# must not be added a second time: doing so listed every aggregated column twice.
woe_columns = columns + rfms_features

# Column roles. score_column and cluster_column are picked by position from rfms_features.
score_column = rfms_features[2]    # 'Monetary'
cluster_column = rfms_features[0]  # 'Recency'
# Alternative kept for reference: cluster_column = 'Cluster'
date_column = 'TransactionStartTime'
target_column = 'FraudResult'
customer_column = "CustomerId"
customer_label = "RFMS_Label"
recency_column = date_column
frequency_column = 'TransactionId'
amount_column = 'Amount'
monetary_column = 'Value'
severity_column = target_column
timezone = 'Africa/Addis_Ababa'
max_bins = 5

# Run process_data on a copy of the preprocessed frame; the scaler argument is passed as None.
data = data_preprocessed.copy()
scaler = None
data_processed = process_data(
    data,
    numerical_features,
    date_column,
    customer_column,
    recency_column,
    frequency_column,
    monetary_column,
    severity_column,
    target_column,
    customer_label,
    columns,
    rfms_features,
    scaler,
    processed_output_dir,
)

# Model

In [None]:
# Build the modeling frame, separate the target from the features, and hold out 20% of
# the rows for testing (fixed random_state for a repeatable split).
# NOTE(review): train_test_split is not imported by name in the visible import cell;
# presumably it arrives through one of the star imports — TODO confirm.
data_final = prepare_for_modeling(data_processed, customer_label, drop_columns)

X, y = data_final.drop(columns=[target_column]), data_final[target_column]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
# Train through CreditScoringModel.train and display whatever it returns as best_model.
# NOTE(review): the test split is passed into train(); confirm it is used only for
# reporting and not for model selection.
credit_model = CreditScoringModel()
best_model = credit_model.train(X_train, y_train, X_test, y_test)
best_model

In [None]:
# Evaluate best_model on the held-out split and display the returned metrics.
metrics = credit_model.evaluate_model(best_model, X_test, y_test)
metrics

# Using Transformers

In [None]:
from scripts.modeling.transformers import *
from scripts.data_utils.data_transformers import *

# Assemble the transformation pipeline from the configuration defined in the cells above.
# Arguments are positional, so their order must match get_transformation_pipeline's signature.
transformation_pipeline = get_transformation_pipeline(
    irrelevant_columns,
    missing_value_strategies,
    date_column,
    categorical_columns,
    numerical_columns,
    dtype_conversions,
    timezone,
    customer_column,
    amount_column,
    numerical_features,
    recency_column,
    frequency_column,
    monetary_column,
    severity_column,
    rfms_features,
    target_column,
    max_bins,
    score_column,
    customer_label,
)
transformation_pipeline

In [None]:
# Fit and apply the transformation pipeline to a fresh copy of the raw Excel frame,
# then summarise the result with .info().
data = data_xlsx.copy()
data_processed_tr = transformation_pipeline.fit_transform(data)
data_processed_tr.info()

In [None]:
# Summary statistics of the transformed frame, transposed so each feature is a row.
data_processed_tr.describe().T

# Full Pipeline

In [None]:
# Build the modeling pipeline from the columns to drop and the customer label column,
# then display it.
modeling_pipeline = get_modeling_pipeline(drop_columns, customer_label)
modeling_pipeline

In [169]:
# Reuse the frame already produced by transformation_pipeline.fit_transform instead of
# fitting the transformers a second time.
data_transformed = data_processed_tr.copy()

# Separate the target from the features and hold out 20% of the rows for testing.
# This rebinds X, y and the four split variables created in the "Model" section, and
# prepare_for_modeling is not applied to this frame.
X, y = data_transformed.drop(columns=[target_column]), data_transformed[target_column]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Wrap the transformation and modeling pipelines in a single FullPipelineModel and display it.
full_pipeline = FullPipelineModel(transformation_pipeline, modeling_pipeline)
full_pipeline

In [None]:
# Fit the combined pipeline on the training split.
# NOTE(review): X_train is derived from data_processed_tr, i.e. data that has already been
# through transformation_pipeline.fit_transform and has had the target column dropped,
# while full_pipeline also wraps transformation_pipeline — confirm FullPipelineModel is
# not meant to receive the raw frame here.
full_pipeline.fit(X_train, y_train)

In [None]:
# Save the fitted full pipeline to 'full_pipeline.pkl'; the path is relative, so the
# file lands in the current working directory.
full_pipeline.save('full_pipeline.pkl')

In [None]:
# Load the saved full pipeline back for inference.
# NOTE(review): the .pkl extension suggests pickle-based serialization — if so, only
# load files from a trusted source, since unpickling can execute arbitrary code.
loaded_pipeline = FullPipelineModel.load('full_pipeline.pkl')

In [None]:
# Make predictions on new data
# predictions = loaded_pipeline.predict(new_data)