In [4]:
import sys
import os

PROJECT_ROOT = os.path.abspath("..")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)


In [5]:
import pandas as pd

df = pd.read_csv("../data/raw_data/data.csv")
df.shape


(95662, 16)

In [6]:
customer_agg = (
    df.groupby("CustomerId")
      .agg(
          total_transaction_value=("Amount", "sum"),
          avg_transaction_value=("Amount", "mean"),
          transaction_count=("TransactionId", "count"),
          negative_transaction_count=("Amount", lambda x: (x < 0).sum())
      )
      .reset_index()
)


In [7]:
df["TransactionStartTime"] = pd.to_datetime(df["TransactionStartTime"])

snapshot_date = df["TransactionStartTime"].max() + pd.Timedelta(days=1)

rfm = (
    df.groupby("CustomerId")
      .agg(
          recency=("TransactionStartTime",
                   lambda x: (snapshot_date - x.max()).days),
          frequency=("TransactionId", "count"),
          monetary=("Amount", "sum")
      )
      .reset_index()
)


In [8]:
customer_features = customer_agg.merge(
    rfm, on="CustomerId", how="inner"
)

customer_features.shape


(3742, 8)

In [9]:
customer_features["high_credit_risk"] = (
    customer_features["negative_transaction_count"] > 0
).astype(int)


In [10]:
customer_features.head()


Unnamed: 0,CustomerId,total_transaction_value,avg_transaction_value,transaction_count,negative_transaction_count,recency,frequency,monetary,high_credit_risk
0,CustomerId_1,-10000.0,-10000.0,1,1,84,1,-10000.0,1
1,CustomerId_10,-10000.0,-10000.0,1,1,84,1,-10000.0,1
2,CustomerId_1001,20000.0,4000.0,5,2,90,5,20000.0,1
3,CustomerId_1002,4225.0,384.090909,11,6,26,11,4225.0,1
4,CustomerId_1003,20000.0,3333.333333,6,2,12,6,20000.0,1


In [11]:
customer_features.isnull().sum()


CustomerId                    0
total_transaction_value       0
avg_transaction_value         0
transaction_count             0
negative_transaction_count    0
recency                       0
frequency                     0
monetary                      0
high_credit_risk              0
dtype: int64

In [12]:
customer_features["high_credit_risk"].value_counts(normalize=True)


high_credit_risk
1    0.735168
0    0.264832
Name: proportion, dtype: float64

In [13]:
from src.model_training import (
    split_features_target,
    train_test_data,
    train_logistic_regression,
    evaluate_model
)


In [15]:
X, y = split_features_target(
    customer_features,
    target_col="high_credit_risk"
)


In [16]:
X_train, X_test, y_train, y_test = train_test_data(X, y)


In [17]:
log_model = train_logistic_regression(X_train, y_train)
log_auc, log_report = evaluate_model(log_model, X_test, y_test)

print("Logistic Regression ROC-AUC:", log_auc)


Logistic Regression ROC-AUC: 1.0


In [18]:
import src.model_training as mt
print(dir(mt))


['LogisticRegression', 'RandomForestClassifier', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'classification_report', 'evaluate_model', 'pd', 'roc_auc_score', 'split_features_target', 'train_logistic_regression', 'train_random_forest', 'train_test_data', 'train_test_split']


In [19]:
import sys, os
sys.path.append(os.path.abspath(".."))


In [20]:
import src.model_training as mt
print(dir(mt))


['LogisticRegression', 'RandomForestClassifier', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', 'classification_report', 'evaluate_model', 'pd', 'roc_auc_score', 'split_features_target', 'train_logistic_regression', 'train_random_forest', 'train_test_data', 'train_test_split']


In [21]:
from src.model_training import train_random_forest

rf_model = train_random_forest(X_train, y_train)
rf_auc, rf_report = evaluate_model(rf_model, X_test, y_test)

print("Random Forest ROC-AUC:", rf_auc)


Random Forest ROC-AUC: 0.9999999999999999


In [23]:
import pandas as pd

comparison = pd.DataFrame({
    "Model": ["Logistic Regression", "Random Forest"],
    "ROC-AUC": [log_auc, rf_auc]
})

comparison


Unnamed: 0,Model,ROC-AUC
0,Logistic Regression,1.0
1,Random Forest,1.0
