### Initialize

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Configuration
# The data directory can be overridden with the CLV_DATA_PATH environment variable so the
# notebook runs on machines other than the author's; when the variable is not set, the
# original location is used unchanged.
DATA_PATH = os.environ.get(
    'CLV_DATA_PATH',
    'C:/Users/USER/Documents/Portfolio stuff/Fintech_CLV_Churn_Analysis/data/',
)
# Input: transaction-level log; output: one row of engineered features per customer
INPUT_FILE = os.path.join(DATA_PATH, 'processed_features.csv')
OUTPUT_FILE = os.path.join(DATA_PATH, 'customer_features.csv')

In [3]:
# Read the transaction-level CSV into memory and preview its first five rows
transaction_log = pd.read_csv(filepath_or_buffer=INPUT_FILE)
transaction_log.head(n=5)

Unnamed: 0,Customer_ID,Transaction_ID,order_number,days_since_prior_order,order_dow,order_hour_of_day,Merchant_Vertical_ID,Product_SKU
0,202279,2,3,8,5,9,13,9327
1,205970,3,16,12,5,17,16,24838
2,205970,3,16,12,5,17,4,21903
3,178520,4,36,7,1,9,3,46842
4,178520,4,36,7,1,9,19,39758


### Calculate Cohort
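Order numbers are normalized so that each customer's first observed order becomes 1 (Normalized_Order_Number). The cohort start is defined as the minimum value of the Normalized_Order_Number for each customer, which is always 1 by construction; the maximum value of the Normalized_Order_Number represents the customer's lifespan in transactions.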

In [None]:
# Smallest order_number seen for each customer in the sampled log,
# returned as a two-column frame: Customer_ID, Min_Order
min_order_number = (
    transaction_log
    .groupby('Customer_ID')['order_number']
    .min()
    .rename('Min_Order')
    .reset_index()
)

In [None]:
# Merge minimum order number back to transaction log.
# A Min_Order column left over from an earlier run of this cell is dropped first; without
# that, re-running the cell would produce Min_Order_x / Min_Order_y and the following
# normalization step would fail with a KeyError on 'Min_Order'.
# validate='many_to_one' asserts that min_order_number has a single row per Customer_ID,
# so the left merge can never add rows to the transaction log.
transaction_log = (
    transaction_log
    .drop(columns='Min_Order', errors='ignore')
    .merge(min_order_number, on='Customer_ID', how='left', validate='many_to_one')
)
transaction_log.head()

In [21]:
# Re-base each order number on the customer's first observed order, so the sequence
# starts at 1 for every customer: order_number - Min_Order + 1
transaction_log['Normalized_Order_Number'] = (
    transaction_log['order_number']
    .sub(transaction_log['Min_Order'])
    .add(1)
)

In [None]:
# Cohort start = smallest Normalized_Order_Number per customer. Because the normalized
# sequence is built as order_number - Min_Order + 1, this minimum is 1 for every customer.
# Normalized_Order_Number itself is reused by the later steps.
cohort_info = (
    transaction_log
    .groupby('Customer_ID')['Normalized_Order_Number']
    .min()
    .rename('Customer_Cohort_Start')
    .reset_index()
)

Unnamed: 0,Customer_ID,Customer_Cohort_Start
0,1,1
1,2,1
2,3,1
3,4,1
4,5,1


### Calculate Monetary Value Proxy (Total Products)
There's no explicit monetary value, so it is approximated by the number of products purchased by each customer.

In [27]:
# Proxy for monetary value: number of transaction-log rows (one per purchased product)
# belonging to each customer, across all of their transactions
monetary_value = (
    transaction_log
    .groupby('Customer_ID')['Product_SKU']
    .size()
    .rename('Total_Products')
    .reset_index()
)

In [30]:
# Preview the first five customers' product counts
monetary_value.head(n=5)

Unnamed: 0,Customer_ID,Total_Products
0,1,8
1,2,22
2,3,12
3,4,1
4,5,4


### Calculate Frequency (Total Transactions)

In [31]:
# Frequency: how many distinct Transaction_ID values each customer has in the log
frequency = (
    transaction_log
    .groupby('Customer_ID')['Transaction_ID']
    .nunique()
    .rename('Total_Transactions')
    .reset_index()
)

In [32]:
# Preview the first five customers' transaction counts
frequency.head(n=5)

Unnamed: 0,Customer_ID,Total_Transactions
0,1,5
1,2,12
2,3,6
3,4,1
4,5,3


### Calculate Recency (Days Since Last Order)
Recency is defined by the value in days_since_prior_order associated with the customer's *last* transaction.

In [None]:
# Locate each customer's last transaction: the largest Normalized_Order_Number per customer
last_normalized_order = (
    transaction_log
    .groupby('Customer_ID', as_index=False)['Normalized_Order_Number']
    .max()
)

Unnamed: 0,Customer_ID,Normalized_Order_Number
0,1,9
1,2,14
2,3,9
3,4,1
4,5,4


In [35]:
# Pull the transaction-log rows that belong to each customer's last order by joining on
# the (Customer_ID, Normalized_Order_Number) pair; one row per product in that order
recency_df = pd.merge(
    last_normalized_order,
    transaction_log,
    how='left',
    on=['Customer_ID', 'Normalized_Order_Number'],
)

In [None]:
# Recency is a copy of days_since_prior_order taken from the rows of the last transaction
recency_df = recency_df.assign(Recency=recency_df['days_since_prior_order'])
recency_df.head(n=5)

Unnamed: 0,Customer_ID,Normalized_Order_Number,Transaction_ID,order_number,days_since_prior_order,order_dow,order_hour_of_day,Merchant_Vertical_ID,Product_SKU,Min_Order,Recency
0,1,9,2550362,10,30,4,8,16,38928,2,30
1,1,9,2550362,10,30,4,8,16,35951,2,30
2,2,14,839880,14,13,3,10,16,20785,1,13
3,3,9,676467,10,17,3,16,4,47766,2,17
4,3,9,676467,10,17,3,16,7,1005,2,17


In [45]:
# The last order contributes one row per product, so keep only the distinct
# (Customer_ID, Recency) pairs; the original row labels are retained
recency_final = (
    recency_df
    .loc[:, ['Customer_ID', 'Recency']]
    .drop_duplicates(keep='first')
)
recency_final.head(n=5)

Unnamed: 0,Customer_ID,Recency
0,1,30
2,2,13
3,3,17
5,4,19
6,5,19


### Merge all features into a single customer-level dataframe

In [46]:
# Combine the per-customer feature tables on Customer_ID (inner joins, as before).
# Each table is expected to contain exactly one row per customer. validate='one_to_one'
# makes pandas raise a MergeError instead of silently duplicating customers when that
# expectation is broken, e.g. if recency_final holds more than one distinct Recency
# value for the same Customer_ID.
customer_features_df = (
    cohort_info
    .merge(monetary_value, on='Customer_ID', validate='one_to_one')
    .merge(frequency, on='Customer_ID', validate='one_to_one')
    .merge(recency_final, on='Customer_ID', validate='one_to_one')
)

In [None]:
# Give the cohort column its final, more descriptive name and preview the result
customer_features_df = customer_features_df.rename(
    columns={'Customer_Cohort_Start': 'Customer_Cohort_Min_Order'}
)
customer_features_df.head(n=5)

Unnamed: 0,Customer_ID,Customer_Cohort_Min_Order,Total_Products,Total_Transactions,Recency
0,1,1,8,5,30
1,2,1,22,12,13
2,3,1,12,6,17
3,4,1,1,1,19
4,5,1,4,3,19


In [49]:
# Persist the customer-level feature table as CSV, without writing the row index
customer_features_df.to_csv(path_or_buf=OUTPUT_FILE, index=False)

In [None]:
# Report the shape and column list of the final table, then summary statistics
# (transposed, rounded to two decimals) for the engineered feature columns
summary_columns = ['Customer_Cohort_Min_Order', 'Total_Products', 'Total_Transactions', 'Recency']
summary_stats = customer_features_df[summary_columns].describe().T.round(2)

print(f"Customer Features DataFrame created. Shape: {customer_features_df.shape}")
print(f"Features: {list(customer_features_df.columns)}")
print("Descriptive Statistics for the new features:")
print(summary_stats)

Customer Features DataFrame created. Shape: (198576, 5)
Features: ['Customer_ID', 'Customer_Cohort_Min_Order', 'Total_Products', 'Total_Transactions', 'Recency']
Descriptive Statistics for the new features:
                              count   mean    std  min  25%   50%   75%    max
Customer_Cohort_Min_Order  198576.0   1.00   0.00  1.0  1.0   1.0   1.0    1.0
Total_Products             198576.0  16.33  20.91  1.0  4.0   9.0  20.0  374.0
Total_Transactions         198576.0   9.18  10.33  1.0  3.0   5.0  11.0   94.0
Recency                    198576.0  15.33  10.41  0.0  7.0  13.0  28.0   30.0


: 