In [1]:
#Import libraries
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Data directory for this analysis. Set the CLV_DATA_PATH environment variable
# to point at the data folder on another machine; when it is unset, the
# original local folder is used, so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    'CLV_DATA_PATH',
    'C:/Users/USER/Documents/Portfolio stuff/Fintech_CLV_Churn_Analysis/data',
)
# Input: customer-level feature table loaded by this notebook.
INPUT_FEATURES = os.path.join(DATA_PATH, 'customer_features_with_merchant.csv')
# Output: model-ready table written at the end of this notebook.
OUTPUT_MODEL_DATA = os.path.join(DATA_PATH, 'model_data.csv')

In [4]:
# Load the customer feature table from the configured input path.
df = pd.read_csv(filepath_or_buffer=INPUT_FEATURES)
# Show (rows, columns) as the cell output to confirm the load.
df.shape

(198576, 6)

### Define Churn Target

In [7]:
# Preview the first five rows to confirm column names and value ranges.
df.head(n=5)

Unnamed: 0,Customer_ID,Customer_Cohort_Min_Order,Total_Products,Total_Transactions,Recency,Usage_Diversity_Count
0,1,1,8,5,30,2
1,2,1,22,12,13,2
2,3,1,12,6,17,3
3,4,1,1,1,19,1
4,5,1,4,3,19,2


In [9]:
# The max 'days_since_prior_order' in this raw dataset is 30, so a customer at
# 30 days is at the highest observed recency value.
# A customer is flagged as churned when Recency is 28 days or more (the
# comparison is inclusive: >= 28), a conservative cut-off just below that maximum.

Churn_Threshold = 28
is_churned = df['Recency'] >= Churn_Threshold
df['Churn_Flag'] = np.where(is_churned, 1, 0)

In [10]:
# Confirmation message documenting the label encoding.
print("-> Defined Churn_Flag (1 = Churned, 0 = Active).")

-> Defined Churn_Flag (1 = Churned, 0 = Active).


In [11]:
# The mean of a 0/1 flag is the fraction of churned customers; scale to percent.
churned_fraction = df['Churn_Flag'].mean()
churn_rate = churned_fraction * 100
print(f"-> Calculated Churn Rate: {churn_rate:.2f}%")

-> Calculated Churn Rate: 25.65%


### Feature Selection
1. Features (X):
    * Total_Products (Monetary proxy)
    * Total_Transactions (Frequency)
    * Recency (direct predictor: `Churn_Flag` is derived from this column)
    * Usage_Diversity_Count (Breadth of Fintech Usage)

2. Target (y):
    * Churn_Flag

In [12]:
# Predictor columns passed to the model.
# NOTE(review): Churn_Flag is computed directly from Recency in this notebook,
# so keeping Recency among the predictors lets a model reconstruct the label
# exactly (target leakage) — confirm this is intended before training on it.
feature_cols = ['Total_Products', 'Total_Transactions', 'Recency', 'Usage_Diversity_Count']

# Feature matrix and binary target (1 = churned, 0 = active).
y = df['Churn_Flag']
X = df[feature_cols]

In [13]:
# Data splitting: hold out 30% of the rows for evaluation. Stratifying on the
# label keeps the churn proportion the same in both partitions, and the fixed
# seed makes the split reproducible.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [15]:
# Standardization: learn the scaling statistics from the training rows only,
# then apply that same fitted scaler to both partitions so that no test-set
# information enters the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Final Output

In [None]:
# Convert the scaled arrays back to DataFrames, restoring the feature names and
# each partition's original row index.
X_train_scaled_df, X_test_scaled_df = (
    pd.DataFrame(scaled, columns=feature_cols, index=source.index)
    for scaled, source in ((X_train_scaled, X_train), (X_test_scaled, X_test))
)

In [17]:
# Combine: attach the target column to each scaled feature frame. Rows are
# matched on the shared index.
train_df = pd.concat((X_train_scaled_df, y_train), axis="columns")
test_df = pd.concat((X_test_scaled_df, y_test), axis="columns")

In [18]:
# Stack the train and test rows into one table and put it back in index order.
final_model_data = pd.concat([train_df, test_df]).sort_index()

In [19]:
# Save final model data. The row index is written as the first CSV column
# (pandas' default behaviour for to_csv).
final_model_data.to_csv(OUTPUT_MODEL_DATA)

In [22]:
# Report (rows, columns) of the saved modeling table as a final sanity check.
print(
    f"Final data shape for modeling: {final_model_data.shape}"
)


Final data shape for modeling: (198576, 5)
