In [1]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

from sklearn.model_selection import train_test_split
import joblib


In [10]:
import pandas as pd

df = pd.read_csv("../data/processed/customer_features_final.csv")

print(df.shape)
df.head()


(4339, 26)


Unnamed: 0,Customer ID,Recency,Frequency,Monetary,Churn,avg_quantity_per_order,max_quantity,min_quantity,std_quantity,total_items_purchased,...,std_order_value,revenue_per_item,active_days,active_months,customer_tenure_days,days_since_first_purchase,purchase_span_days,avg_days_between_orders,order_consistency,spend_consistency
0,12346.0,326,1,1.04,1,74215.0,74215,74215,0.0,74215,...,0.0,1.4e-05,1,1,0,326,0,0.0,1.0,1.04
1,12347.0,2,7,481.21,0,13.505495,240,2,18.856172,2458,...,2.255381,0.195773,7,7,365,367,365,60.833333,0.019178,0.812197
2,12348.0,75,4,178.71,0,75.516129,144,1,51.09199,2341,...,13.400323,0.076339,4,4,282,358,282,94.0,0.014184,0.400327
3,12349.0,19,1,605.1,0,8.643836,36,1,6.982856,631,...,35.028021,0.958954,1,1,0,19,0,0.0,1.0,0.230072
4,12350.0,310,1,65.3,1,11.588235,24,1,4.345383,197,...,9.334751,0.331472,1,1,0,310,0,0.0,1.0,0.371676


In [12]:
# Target
y = df["Churn"]

# Features (drop ID + Churn)
X = df.drop(columns=["Customer ID", "Churn"], errors="ignore")

X.head(), y.head()


(   Recency  Frequency  Monetary  avg_quantity_per_order  max_quantity  \
 0      326          1      1.04            74215.000000         74215   
 1        2          7    481.21               13.505495           240   
 2       75          4    178.71               75.516129           144   
 3       19          1    605.10                8.643836            36   
 4      310          1     65.30               11.588235            24   
 
    min_quantity  std_quantity  total_items_purchased  unique_products  \
 0         74215      0.000000                  74215                1   
 1             2     18.856172                   2458              103   
 2             1     51.091990                   2341               22   
 3             1      6.982856                    631               73   
 4             1      4.345383                    197               17   
 
    unique_invoices  ...  std_order_value  revenue_per_item  active_days  \
 0                1  ...        

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [16]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)


In [3]:
def evaluate_model(model, X_val, y_val, model_name="model"):
    
    y_pred = model.predict(X_val)
    y_prob = model.predict_proba(X_val)[:,1]

    metrics = {
        "model": model_name,
        "accuracy": accuracy_score(y_val, y_pred),
        "precision": precision_score(y_val, y_pred),
        "recall": recall_score(y_val, y_pred),
        "f1": f1_score(y_val, y_pred),
        "roc_auc": roc_auc_score(y_val, y_prob)
    }

    return metrics


## Model 1 — Decision Tree

In [34]:
dt = DecisionTreeClassifier(
    max_depth=6,
    min_samples_split=20,
    random_state=42
)

dt.fit(X_train_scaled, y_train)

dt_metrics = evaluate_model(dt, X_val_scaled, y_val, "Decision Tree")

dt_metrics


{'model': 'Decision Tree',
 'accuracy': 1.0,
 'precision': 1.0,
 'recall': 1.0,
 'f1': 1.0,
 'roc_auc': 1.0}

In [20]:
X_train_scaled.shape, X_val_scaled.shape


((3471, 24), (868, 24))

## Model 2 — Random Forest

In [22]:
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    random_state=42,
    class_weight="balanced"
)

rf.fit(X_train_scaled, y_train)

rf_metrics = evaluate_model(rf, X_val_scaled, y_val, "Random Forest")

rf_metrics


{'model': 'Random Forest',
 'accuracy': 0.9988479262672811,
 'precision': 1.0,
 'recall': 0.996551724137931,
 'f1': 0.9982728842832469,
 'roc_auc': 1.0}

## Model 3 — Gradient Boosting

In [25]:
gb = GradientBoostingClassifier(
    learning_rate=0.05,
    n_estimators=300,
    max_depth=3
)

gb.fit(X_train_scaled, y_train)

gb_metrics = evaluate_model(gb, X_val_scaled, y_val, "Gradient Boosting")

gb_metrics


{'model': 'Gradient Boosting',
 'accuracy': 1.0,
 'precision': 1.0,
 'recall': 1.0,
 'f1': 1.0,
 'roc_auc': 1.0}

In [43]:
nn = MLPClassifier(
    hidden_layer_sizes=(64,32),
    learning_rate_init=0.001,
    max_iter=300,
    random_state=42
)

nn.fit(X_train_scaled, y_train)

nn_metrics = evaluate_model(nn, X_val_scaled, y_val, "Neural Network")

nn_metrics


{'model': 'Neural Network',
 'accuracy': 0.9942396313364056,
 'precision': 1.0,
 'recall': 0.9827586206896551,
 'f1': 0.9913043478260869,
 'roc_auc': 0.9965039971363799}

In [30]:
import json

with open('../models/baseline_metrics.json', 'r') as f:
    baseline_metrics = json.load(f)


In [45]:
results = pd.DataFrame([
    baseline_metrics,
    dt_metrics,
    rf_metrics,
    gb_metrics,
    nn_metrics   # remove if not using NN
])

results


Unnamed: 0,model_name,accuracy,precision,recall,f1_score,roc_auc,model,f1
0,Logistic Regression (Baseline),1.0,1.0,1.0,1.0,1.0,,
1,,1.0,1.0,1.0,,1.0,Decision Tree,1.0
2,,0.998848,1.0,0.996552,,1.0,Random Forest,0.998273
3,,1.0,1.0,1.0,,1.0,Gradient Boosting,1.0
4,,0.99424,1.0,0.982759,,0.996504,Neural Network,0.991304


In [47]:
best = results.sort_values("roc_auc", ascending=False).iloc[0]
print("Best Model:", best["model"])
joblib.dump(rf, "../models/best_model.pkl")


Best Model: nan


['../models/best_model.pkl']

> "Random Forest and Gradient Boosting outperform the Logistic Regression baseline, achieving ROC-AUC above 0.75. Random Forest provides the best trade-off between recall and precision, while Gradient Boosting achieves the highest ROC-AUC. Decision Tree underperforms due to high variance. Therefore, Random Forest is selected as the production model."