In [1]:
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", None)

# Load data
df = pd.read_csv("../data/raw_data/data.csv")

df.head()


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


In [2]:
customer_agg = df.groupby("CustomerId").agg(
    total_transaction_value=("Amount", "sum"),
    avg_transaction_value=("Amount", "mean"),
    transaction_count=("TransactionId", "count"),
    negative_transaction_count=("Amount", lambda x: (x < 0).sum())
).reset_index()

customer_agg.head()


Unnamed: 0,CustomerId,total_transaction_value,avg_transaction_value,transaction_count,negative_transaction_count
0,CustomerId_1,-10000.0,-10000.0,1,1
1,CustomerId_10,-10000.0,-10000.0,1,1
2,CustomerId_1001,20000.0,4000.0,5,2
3,CustomerId_1002,4225.0,384.090909,11,6
4,CustomerId_1003,20000.0,3333.333333,6,2


In [3]:
customer_agg["high_credit_risk"] = (
    (customer_agg["total_transaction_value"] < 0) |
    (customer_agg["negative_transaction_count"] > 0)
).astype(int)

customer_agg["high_credit_risk"].value_counts(normalize=True)


high_credit_risk
1    0.735168
0    0.264832
Name: proportion, dtype: float64

In [4]:
# Convert transaction time to datetime
df["TransactionStartTime"] = pd.to_datetime(df["TransactionStartTime"])

# Define reference date (latest transaction in dataset)
reference_date = df["TransactionStartTime"].max()
reference_date


Timestamp('2019-02-13 10:01:28+0000', tz='UTC')

In [5]:
rfm = df.groupby("CustomerId").agg(
    recency=("TransactionStartTime", lambda x: (reference_date - x.max()).days),
    frequency=("TransactionId", "count"),
    monetary=("Amount", "sum")
).reset_index()

rfm.head()


Unnamed: 0,CustomerId,recency,frequency,monetary
0,CustomerId_1,83,1,-10000.0
1,CustomerId_10,83,1,-10000.0
2,CustomerId_1001,89,5,20000.0
3,CustomerId_1002,25,11,4225.0
4,CustomerId_1003,11,6,20000.0


In [6]:
features = pd.merge(
    customer_agg,
    rfm,
    on="CustomerId",
    how="inner"
)

features.head()


Unnamed: 0,CustomerId,total_transaction_value,avg_transaction_value,transaction_count,negative_transaction_count,high_credit_risk,recency,frequency,monetary
0,CustomerId_1,-10000.0,-10000.0,1,1,1,83,1,-10000.0
1,CustomerId_10,-10000.0,-10000.0,1,1,1,83,1,-10000.0
2,CustomerId_1001,20000.0,4000.0,5,2,1,89,5,20000.0
3,CustomerId_1002,4225.0,384.090909,11,6,1,25,11,4225.0
4,CustomerId_1003,20000.0,3333.333333,6,2,1,11,6,20000.0


In [7]:
final_features = features.drop(
    columns=[
        "CustomerId",
        "monetary"  # duplicate of total_transaction_value
    ]
)

final_features.head()


Unnamed: 0,total_transaction_value,avg_transaction_value,transaction_count,negative_transaction_count,high_credit_risk,recency,frequency
0,-10000.0,-10000.0,1,1,1,83,1
1,-10000.0,-10000.0,1,1,1,83,1
2,20000.0,4000.0,5,2,1,89,5
3,4225.0,384.090909,11,6,1,25,11
4,20000.0,3333.333333,6,2,1,11,6


In [8]:
X = final_features.drop(columns=["high_credit_risk"])
y = final_features["high_credit_risk"]

X.shape, y.shape


((3742, 6), (3742,))

In [9]:
X.describe().T


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
total_transaction_value,3742.0,171737.740647,2717305.0,-104900000.0,4077.4375,20000.0,79967.75,83451240.0
avg_transaction_value,3742.0,15715.616012,167699.1,-425000.0,1000.0,2583.846429,4877.613636,8601821.0
transaction_count,3742.0,25.564404,96.9296,1.0,2.0,7.0,20.0,4091.0
negative_transaction_count,3742.0,10.205505,73.17164,0.0,0.0,3.0,7.0,4091.0
recency,3742.0,30.461251,27.11893,0.0,5.0,24.0,53.0,90.0
frequency,3742.0,25.564404,96.9296,1.0,2.0,7.0,20.0,4091.0


## Feature Engineering and Proxy Target Definition

### Proxy Target Variable
The dataset does not include an explicit credit default label. To address this, a proxy target variable (`high_credit_risk`) was constructed based on customer transaction behavior. Customers are classified as high credit risk if they exhibit net negative transaction value or have at least one negative transaction. This approach reflects real-world credit risk assessment practices where repayment behavior and cash flow patterns are used as indicators of default risk in the absence of labeled outcomes.

### Customer-Level Aggregation
Transaction-level data was aggregated to the customer level to prevent data leakage and ensure alignment with credit scoring objectives. Aggregated features include total and average transaction values, transaction frequency, and the count of negative transactions, capturing both volume and quality of customer activity.

### RFM Feature Construction
Recency, Frequency, and Monetary (RFM) features were derived to capture temporal engagement and spending behavior. Recency measures the number of days since the customerâ€™s last transaction, frequency represents transaction count, and monetary reflects cumulative transaction value. These features are widely used in credit risk and behavioral modeling due to their strong interpretability and predictive relevance.

### Final Feature Set
The final dataset consists exclusively of numerical, customer-level features suitable for machine learning models. Identifier columns were removed, and redundant features were excluded to reduce multicollinearity. This results in a clean, model-ready feature matrix and a clearly defined target variable for subsequent modeling tasks.
