In [3]:
import os
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from fontTools.misc.classifyTools import Classifier
from numba.np.random.distributions import random_standard_exponential
from pandas.core.common import random_state
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor 
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error 
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


import warnings
warnings.filterwarnings('ignore')
import pickle 

In [4]:
import kagglehub
path = kagglehub.dataset_download("olistbr/brazilian-ecommerce")
print("path to dataset file:", path)


path to dataset file: /Users/modupeolafagbenro/.cache/kagglehub/datasets/olistbr/brazilian-ecommerce/versions/2


In [5]:
files= os.listdir(path)
print("files inside the directory path and the dataset inside")
for file in files:
    print(file)

files inside the directory path and the dataset inside
olist_sellers_dataset.csv
product_category_name_translation.csv
olist_orders_dataset.csv
olist_order_items_dataset.csv
olist_customers_dataset.csv
olist_geolocation_dataset.csv
olist_order_payments_dataset.csv
olist_order_reviews_dataset.csv
olist_products_dataset.csv


In [6]:
def load_clean_dataset():
    """Load clean preprocessed dataset"""
    try:
        with open("../data/processed/main_df_cleaned.pkl", "rb") as f:
            main_df = pickle.load(f)
            print(f"loading clean data from data preprocessing: {main_df.shape}")
            return main_df
    except FileNotFoundError:
        print("file is not found: Run data preprocessing file first")
        return None
            
            

In [7]:
main_df= load_clean_dataset()

if main_df is not None:
    print(f"dataset in main_df is: {main_df.shape}")
    print(f"{main_df.head(5)}")

loading clean data from data preprocessing: (112650, 33)
dataset in main_df is: (112650, 33)
                           order_id                       customer_id  \
0  e481f51cbdc54678b7cc49136f2d6af7  9ef432eb6251297304e76186b10a928d   
1  53cdb2fc8bc7dce0b6741e2150273451  b0830fb4747a6c6d20dea0b8c802d7ef   
2  47770eb9100c2d0c44946d9cf07ec65d  41ce2a54c0b03bf3443c3d931a367089   
3  949d5b44dbf5de918fe9c16f97b45f8a  f88197465ea7920adcdbec7375364d82   
4  ad21c59c0840e6cb83a9ceb5573f8159  8ab97904e6daea8866dbdbc4fb7aad2c   

  order_status order_purchase_timestamp   order_approved_at  \
0    delivered      2017-10-02 10:56:33 2017-10-02 11:07:15   
1    delivered      2018-07-24 20:41:37 2018-07-26 03:24:27   
2    delivered      2018-08-08 08:38:49 2018-08-08 08:55:23   
3    delivered      2017-11-18 19:28:06 2017-11-18 19:45:59   
4    delivered      2018-02-13 21:18:39 2018-02-13 22:20:29   

  order_delivered_carrier_date order_delivered_customer_date  \
0          2017-10-04 19:

In [8]:
main_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 33 columns):
 #   Column                         Non-Null Count   Dtype         
---  ------                         --------------   -----         
 0   order_id                       112650 non-null  object        
 1   customer_id                    112650 non-null  object        
 2   order_status                   112650 non-null  object        
 3   order_purchase_timestamp       112650 non-null  datetime64[ns]
 4   order_approved_at              112650 non-null  datetime64[ns]
 5   order_delivered_carrier_date   111456 non-null  datetime64[ns]
 6   order_delivered_customer_date  110196 non-null  datetime64[ns]
 7   order_estimated_delivery_date  112650 non-null  datetime64[ns]
 8   order_item_id                  112650 non-null  int64         
 9   product_id                     112650 non-null  object        
 10  seller_id                      112650 non-null  object        
 11  

In [9]:
#adding payment table to the existing dataset for the purpose of modeling customer churn modeling 
#merge the payment datadet with thye clean dataset for modeling purposes
payment_df=pd.read_csv(os.path.join(path, "olist_order_payments_dataset.csv"))

#get prinary payment method or order(most common payment method for each order)
primary_payments =payment_df.groupby('order_id').agg({
    'payment_type': lambda x: x.mode()[0], 
    'payment_value': 'sum' 
}).reset_index()

#merge with your current main_df
main_df_with_payments = main_df.merge(primary_payments, on='order_id', how='left')

print(f"Shape before:{main_df.shape}")
print(f"shape after :{main_df_with_payments.shape}")

Shape before:(112650, 33)
shape after :(112650, 35)


In [10]:
main_df_with_payments.columns

Index(['order_id', 'customer_id', 'order_status', 'order_purchase_timestamp',
       'order_approved_at', 'order_delivered_carrier_date',
       'order_delivered_customer_date', 'order_estimated_delivery_date',
       'order_item_id', 'product_id', 'seller_id', 'shipping_limit_date',
       'price', 'freight_value', 'customer_unique_id',
       'customer_zip_code_prefix', 'customer_city', 'customer_state',
       'product_category_name', 'product_name_lenght',
       'product_description_lenght', 'product_photos_qty', 'product_weight_g',
       'product_length_cm', 'product_height_cm', 'product_width_cm',
       'order_year', 'order_month', 'order_day_of_week', 'order_hour',
       'delivery_days', 'total_order_value', 'is_weekend', 'payment_type',
       'payment_value'],
      dtype='object')

In [11]:
main_df_with_payments['payment_type'].nunique()

4

In [12]:
print(main_df_with_payments['payment_type'])

0             voucher
1              boleto
2         credit_card
3         credit_card
4         credit_card
             ...     
112645    credit_card
112646    credit_card
112647    credit_card
112648    credit_card
112649     debit_card
Name: payment_type, Length: 112650, dtype: object


In [14]:
def create_realistic_churn_dataset(df):
    """
    create a realistic churn dataset using recency, frequency, and monetary values + business logic 
    """
    
    print("creating churn definition .......")
    
    #calculate customer-level features from all data 
    customer_summary=df.groupby('customer_id').agg({
        'price':['sum', 'mean', 'count'],
        'freight_value': 'mean',
        'total_order_value': 'mean',
        'product_category_name': lambda x: x.mode().iloc[0] if not x.empty and len(x.mode()) > 0 else 'unknown',
        'product_weight_g': 'mean',
        'customer_state': 'first',
        'payment_type': lambda x : x.mode().iloc[0] if not x.empty and len(x.mode()) > 0 else 'unknown',
        'order_purchase_timestamp': ['min', 'max'],
        'order_month': lambda x : x.mode()[0] if not x.empty and len(x.mode()) > 0 else 1,
        'is_weekend': lambda x : (x == True).mean()
    }).reset_index()
    
    
    #flatten column names 
    customer_summary.columns = ['customer_id', 'total_spent', 'avg_order_value', 'order_frequency', 'avg_freight', 'avg_total_order', 'favorite_category', "avg_product_weight", 'customer_state', 'preferred_payment', 'first_order', 'last_order', 'favorite_month', 'weekend_tendency']
    
    #calculate RFM metrics 
    max_date = df['order_purchase_timestamp'].max()
    customer_summary['recency_days'] = (max_date - customer_summary['last_order']).dt.days
    customer_summary['customer_lifetime_days']=(customer_summary['last_order'] - customer_summary['first_order']).dt.days
    customer_summary['avg_days_between_orders'] = customer_summary['customer_lifetime_days']/ (customer_summary['order_frequency'] + 1)
    
    return customer_summary, max_date


def define_churn_business_logic(customer_data):
    """
    Define churn using business logic:
    -High recency(have not ordered recently)
    -low frequency(few order)
    -low monetary value (small spender)
    """
    
    #Calculate thresholds based on data distributions 
    recency_75 = customer_data['recency_days'].quantile(0.75) #top 25% most recent 
    frequency_25 = customer_data['order_frequency'].quantile(0.25) #buttom 25% frequency
    monetary_25 = customer_data['total_spent'].quantile(0.25) #button 25% spending 
    
    print(f"📊 Churn Thresholds:")
    print(f"Recency threshold: {recency_75:.0f} days")
    print(f"Frequency threshold: {frequency_25:.0f} orders")
    print(f"Monetary threshold: ${monetary_25:.2f}")
          
          
    #define churn: customers who are poor on at least 2 out of 3 RFM metrics 

    high_recency = customer_data['recency_days'] > recency_75
    low_frequency = customer_data['order_frequency'] <= frequency_25
    low_monetary = customer_data['total_spent'] <= monetary_25
    
    #counting how many "bad" RFM scores each customer has
    rfm_score = high_recency.astype(int) + low_frequency.astype(int) + low_monetary.astype(int)
    
    
    #churn if 2 or more poor RFM metrics 
    is_churned =(rfm_score >= 2).astype(int)
    
    return is_churned


def define_churn_percentile_method(customer_data):
    """
    Alternative: used pure percentile-based approach
    
    """
    
    #combine RFM into a single score 
    #Normalize each metric(0-1 scale)
    
    recency_norm = (customer_data['recency_days'] - customer_data['recency_days'].min()) / (customer_data['recency_days'].min())
    frequency_norm= 1- ((customer_data['order_frequency'] - customer_data['order_frequency'].min()) / (customer_data['order_frequency'].max() - customer_data['order_frequency'].min()))
    monetary_norm = 1 - ((customer_data['total_spent'] - customer_data['total_spent'].min()) / (customer_data['total_spent'].max() - customer_data['total_spent'].min()))
    
    #comb ine the 3 churn score 
    churn_score = (recency_norm + frequency_norm + monetary_norm) /3 
    
    #top 3% of churn score = churned 
    churn_threshold = churn_score.quantile(0.7)
    is_churned = (churn_score >= churn_threshold).astype(int)
    
    return is_churned


    
    
    









    

In [15]:
#create customer feature 
customer_features, max_date = create_realistic_churn_dataset(main_df_with_payments
                                                            )
 
print(f"Customer features created is : {customer_features.head()}")

print(f"Customer feature create tail is : {customer_features.tail()}")

print(f"Customer features shape is : {customer_features.shape}")

#Trying both churn definitions
print("\n Method 1: Business Logic Churn")
churn_business = define_churn_business_logic(customer_features)
print(f"Churn business logic distribution:")
print(f"Churn Business metrics is :{churn_business.head()}")

print(f"the unique values of churn business metrics is :{churn_business.nunique()}")
print(f"Count of churn business metrics is : {churn_business.value_counts()}")
print(f"Churn  avg rate: {churn_business.mean() *100:.1f}%")

print("n\ Method 2: Percentile-based Churn")
churn_percentile = define_churn_percentile_method(customer_features)
print("Churn percentile distribution:")
print(f"Churn percentile distribution dataset is :{churn_percentile.head()}")
print(f"Churn percentile distribution dataset shape is :{churn_percentile.shape}")
print(f"Churn Percentile unique value : {churn_percentile.nunique()}")
print(f"Count of  Churn Percentile unique value : {churn_percentile.value_counts()}")
print(f" Churn percentile avg rate is: {churn_percentile.mean() *100:.1f}%")



#best approach to use is the business logic approach

customer_features['is_churned'] = churn_business

print(f"n\ Final churn Dataset")
print(f"shape of the whole dataset is : {customer_features.shape}")
print(f"the head of the dataset is : {customer_features.head()}")
print("Churn distribution:")
print(customer_features['is_churned'].value_counts())
print(f"Final churn rate: {customer_features['is_churned'].mean()*100:.1f}%")

 

creating churn definition .......
Customer features created is :                         customer_id  total_spent  avg_order_value  \
0  00012a2ce6f8dcda20d059ce98491703        89.80            89.80   
1  000161a058600d5901f007fab4c27140        54.90            54.90   
2  0001fd6190edaaf884bcaf3d49edf079       179.99           179.99   
3  0002414f95344307404f0ace7a26f1d5       149.90           149.90   
4  000379cdec625522490c315e70c7a9fb        93.00            93.00   

   order_frequency  avg_freight  avg_total_order favorite_category  \
0                1        24.94        2239.6120        brinquedos   
1                1        12.51         686.7990      beleza_saude   
2                1        15.43        2777.2457             bebes   
3                1        29.45        4414.5550        cool_stuff   
4                1        14.01        1302.9300   cama_mesa_banho   

   avg_product_weight customer_state preferred_payment         first_order  \
0              4267.0

churn dataset preparation for modeling 

In [18]:
#features fro modeling 
churn_features = ['total_spent', 'avg_order_value', 'order_frequency', 'avg_freight', 'avg_total_order','favorite_category', 'avg_product_weight', 'customer_state', 'preferred_payment', 'favorite_month', 'weekend_tendency', 'recency_days', 'customer_lifetime_days', 'avg_days_between_orders']

target_churn = 'is_churned'

#prepare data for modeling 

X_churn =customer_features[churn_features].copy()
y_churn = customer_features[target_churn].copy()


print(f"\nFeatures shape: {X_churn.shape}")
print(f"Target distribution")
print(y_churn.value_counts())
print(X_churn.head(5))


Features shape: (98666, 14)
Target distribution
is_churned
0    58568
1    40098
Name: count, dtype: int64
   total_spent  avg_order_value  order_frequency  avg_freight  \
0        89.80            89.80                1        24.94   
1        54.90            54.90                1        12.51   
2       179.99           179.99                1        15.43   
3       149.90           149.90                1        29.45   
4        93.00            93.00                1        14.01   

   avg_total_order favorite_category  avg_product_weight customer_state  \
0        2239.6120        brinquedos              4267.0             SP   
1         686.7990      beleza_saude               150.0             MG   
2        2777.2457             bebes               750.0             ES   
3        4414.5550        cool_stuff              1600.0             MG   
4        1302.9300   cama_mesa_banho               800.0             SP   

  preferred_payment  favorite_month  weekend_tende

In [20]:
#handling the categorical variables 
categorical_churn_features =['favorite_category', 'customer_state', 'preferred_payment']
churn_encoders = {}

for col in categorical_churn_features:
    le= LabelEncoder()
    X_churn[col] =le.fit_transform(X_churn[col].astype(str))
    churn_encoders[col] =le

#train_test split 
X_train_churn, X_test_churn, y_train_churn, y_test_churn= train_test_split(X_churn, y_churn, test_size=0.2, random_state=42, stratify= y_churn)

print(f"n\Training set: {X_train_churn.shape}")
print(f"n\Test set: {X_test_churn.shape}")
print("Training target distribution")
print(y_train_churn.value_counts())
    

n\Training set: (78932, 14)
n\Test set: (19734, 14)
Training target distribution
is_churned
0    46854
1    32078
Name: count, dtype: int64


In [21]:
#standardization 
churn_scaler = StandardScaler()
X_train_scaled = churn_scaler.fit_transform(X_train_churn)
X_test_scaled = churn_scaler.transform(X_test_churn)


In [22]:
#actual churn modeling 
churn_models= {
"Logstic Regression" : LogisticRegression(random_state=42, max_iter=1000, class_weight='balanced'),
    "Random Forest classifier": RandomForestClassifier(n_estimators= 100, random_state=42, class_weight='balanced'),
    "XGBoostClassifier": XGBClassifier(random_state=42, eval_metrics ='logloss', sclae_pos_weight=2)
}

#train the model 
churn_results ={}
for name, model in churn_models.items():
    print(f"\n======{name} (Customer Churn Prediction) =====")

    if name == "Logistic Regression":
        model.fit(X_train_scaled, y_train_churn)
        y_pred_churn = model.predict(X_test_scaled)
    else:
        model.fit(X_train_churn, y_train_churn)
        y_pred_churn =model.predict(X_test_churn)


    #Metrics
    accuracy= accuracy_score(y_test_churn, y_pred_churn)
    churn_results[name] = {"Accuracy" : accuracy}

    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test_churn, y_pred_churn))

    #compare results 
    churn_df_results = pd.DataFrame(churn_results).T
    print("\n=======Customer Churn Prediction Results Comparison =======")
    print(churn_df_results)
    



Accuracy: 0.7698

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.75      0.79     11714
           1       0.69      0.80      0.74      8020

    accuracy                           0.77     19734
   macro avg       0.77      0.77      0.77     19734
weighted avg       0.78      0.77      0.77     19734


                    Accuracy
Logstic Regression  0.769839

Accuracy: 1.0000

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11714
           1       1.00      1.00      1.00      8020

    accuracy                           1.00     19734
   macro avg       1.00      1.00      1.00     19734
weighted avg       1.00      1.00      1.00     19734


                          Accuracy
Logstic Regression        0.769839
Random Forest classifier  1.000000

Accuracy: 0.9994

Classification Report:
              precision    recall  f1-score   support



Getting the Feature Importance 

In [23]:
if "Random Forest classifier" in churn_models:
    feature_importance = pd.DataFrame({
        'feature': churn_features,
        'importance' : churn_models['Random Forest classifier'].feature_importances_
    }).sort_values('importance', ascending=False)

    print("\nTop 5 Churn Drivers")
    print(feature_importance.head())
    

print("\n Churn prediction model Complete")
print("Realistic churn distribution")
print("Binary classification (0=Stay, 1= Churn")
print("Multiple model compared")
print("Feature Importance identified") 
        


Top 5 Churn Drivers
            feature  importance
11     recency_days    0.445784
0       total_spent    0.236400
1   avg_order_value    0.154590
2   order_frequency    0.069680
4   avg_total_order    0.046057

 Churn prediction model Complete
Realistic churn distribution
Binary classification (0=Stay, 1= Churn
Multiple model compared
Feature Importance identified


In [24]:
print("🌳 WHAT RANDOM FOREST LEARNED:")
print("Decision Tree Logic (simplified):")
print("IF recency_days > 355 AND total_spent <= 45.90:")
print("    → CHURN (1)")
print("ELIF recency_days > 355 AND order_frequency <= 1:")
print("    → CHURN (1)") 
print("ELSE:")
print("    → LOYAL (0)")

# Random Forest captures these patterns PERFECTLY!

🌳 WHAT RANDOM FOREST LEARNED:
Decision Tree Logic (simplified):
IF recency_days > 355 AND total_spent <= 45.90:
    → CHURN (1)
ELIF recency_days > 355 AND order_frequency <= 1:
    → CHURN (1)
ELSE:
    → LOYAL (0)


In [20]:
print("🔍 FEATURE IMPORTANCE ANALYSIS:")
print("1. recency_days (44.3%) - DOMINANT predictor")
print("   • If > 355 days → likely churn")
print("   • Clear threshold-based rule")

print("2. total_spent (24.4%) - Secondary predictor") 
print("   • If < $45.90 → likely churn")
print("   • Another clear threshold")

print("3. avg_order_value (14.6%) - Supporting predictor")
print("   • Correlated with total_spent")

print("4. order_frequency (7.1%) - Minor predictor")
print("   • Most customers have 1 order anyway")

🔍 FEATURE IMPORTANCE ANALYSIS:
1. recency_days (44.3%) - DOMINANT predictor
   • If > 355 days → likely churn
   • Clear threshold-based rule
2. total_spent (24.4%) - Secondary predictor
   • If < $45.90 → likely churn
   • Another clear threshold
3. avg_order_value (14.6%) - Supporting predictor
   • Correlated with total_spent
4. order_frequency (7.1%) - Minor predictor
   • Most customers have 1 order anyway


In [29]:
#saving the best model  -method 1
#with open("logistic_regression_model.pkl", "wb") as f:
    #pickle.dump(churn_models["Logistic Regression"], f)
    
#with open("random_forest_model.pkl", "wb") as f:
    #pickle.dump(churn_models["Random Forest classifier"], f)
    
#with open("xgboost_model.pkl", "wb") as f:
    #pickle.dump(churn_models["XGBoostClassifier"], f)
    
import joblib   
joblib.dump(churn_models["Logstic Regression"], 'logistic_regression_model.joblib')




['logistic_regression_model.joblib']

In [30]:
joblib.dump(churn_models["Random Forest classifier"], 'random_forest_model.joblib')

['random_forest_model.joblib']

In [31]:
joblib.dump(churn_models["XGBoostClassifier"], 'xgboost_model.joblib')


['xgboost_model.joblib']

In [32]:

joblib.dump(churn_scaler, 'churn_scaler.joblib')

['churn_scaler.joblib']

In [33]:
#save tge features 

print("All model and preprocessor saved successfully")
feature_order = churn_features
joblib.dump(feature_order, 'feature_order.pkl')



All model and preprocessor saved successfully


In [34]:
# In your training script, save the most common values
most_common_categories = {
    'favorite_category': X_churn['favorite_category'].mode()[0],
    'customer_state': X_churn['customer_state'].mode()[0], 
    'preferred_payment': X_churn['preferred_payment'].mode()[0]
}
joblib.dump(most_common_categories, 'default_categories.joblib')

['default_categories.joblib']