In [1]:
!pip install imblearn

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# 1. Import all the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score,roc_auc_score
from imblearn.over_sampling import SMOTE
import joblib

In [3]:
# 2.Load the dataseta
df = pd.read_csv("ecommerce_churn_dataset.csv")
df.head()

Unnamed: 0,order_count,total_spent,last_month_expenditure,coupon_used,preferred_category,payment_method,location_type,rating_last_purchase,churned
0,7,16295,4858,No,Apparel,COD,Tier 1,5,0
1,19,4926,723,No,Books,Credit Card,Tier 1,3,0
2,2,23833,0,Yes,Apparel,COD,Tier 2,4,0
3,12,20269,3732,Yes,Apparel,UPI,Tier 2,1,0
4,10,18931,0,Yes,Books,Credit Card,Tier 1,3,0


In [4]:
# 3. Basic Info
df.info()
df.describe()
df.isnull().sum()  # Check for missing values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   order_count             10000 non-null  int64 
 1   total_spent             10000 non-null  int64 
 2   last_month_expenditure  10000 non-null  int64 
 3   coupon_used             10000 non-null  object
 4   preferred_category      10000 non-null  object
 5   payment_method          10000 non-null  object
 6   location_type           10000 non-null  object
 7   rating_last_purchase    10000 non-null  int64 
 8   churned                 10000 non-null  int64 
dtypes: int64(5), object(4)
memory usage: 703.3+ KB


order_count               0
total_spent               0
last_month_expenditure    0
coupon_used               0
preferred_category        0
payment_method            0
location_type             0
rating_last_purchase      0
churned                   0
dtype: int64

In [5]:
# 4. Data Preparation
# Step 1: Data Labeling
# So, no need to map anything
df.head()

# 🔍 Class Distribution:
print("🔍 Class Distribution:")
print(df['churned'].value_counts())
# 📊 Percentage Distribution:
print("\n📊 Percentage Distribution:")
print(df['churned'].value_counts(normalize=True) * 100)


🔍 Class Distribution:
churned
0    6768
1    3232
Name: count, dtype: int64

📊 Percentage Distribution:
churned
0    67.68
1    32.32
Name: proportion, dtype: float64


In [6]:
# Step 2: Data Reduction - Drop all the unimportant features for training the model

# df = df.drop(columns=['customer_id']) -column removed before generation
df.head(20)

Unnamed: 0,order_count,total_spent,last_month_expenditure,coupon_used,preferred_category,payment_method,location_type,rating_last_purchase,churned
0,7,16295,4858,No,Apparel,COD,Tier 1,5,0
1,19,4926,723,No,Books,Credit Card,Tier 1,3,0
2,2,23833,0,Yes,Apparel,COD,Tier 2,4,0
3,12,20269,3732,Yes,Apparel,UPI,Tier 2,1,0
4,10,18931,0,Yes,Books,Credit Card,Tier 1,3,0
5,20,18412,0,No,Books,COD,Tier 1,3,0
6,9,15002,0,No,Electronics,COD,Tier 1,1,1
7,18,10733,0,No,Books,Credit Card,Tier 2,5,0
8,15,23983,0,Yes,Food,COD,Tier 2,5,0
9,3,15041,1342,No,Books,Credit Card,Tier 2,4,0


In [7]:
# Step 3: Data Wrangling = cleaning + reshaping + transforming data 

# Cleaning- fixing data types, handling missing values, removing invalids and duplicates
numeric_cols = ['total_spent', 'last_month_expenditure', 'order_count', 'rating_last_purchase']
for col in numeric_cols:
    df[col] = df[col].replace('₹', '', regex=True)
    df[col] = pd.to_numeric(df[col], errors='coerce')     # Convert to numeric, turn errors into NaN

df = df[(df['rating_last_purchase'] >= 1) & (df['rating_last_purchase'] <= 5)]
df = df.dropna()
df = df.drop_duplicates().reset_index(drop=True)

# Reshaping- drop unnecessary columns, structure and reorder the columns
# Already structured here

# Transformations- Encoding - optional for Random Forest or Tree-based models
df_encoded = df.copy()
cat_cols = ['coupon_used', 'preferred_category', 'payment_method', 'location_type']
label_encoders = {}

for col in cat_cols:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le  # save encoder for decoding later if needed

df_encoded['churned'] = df_encoded['churned'].astype(int)
df_encoded.head()

Unnamed: 0,order_count,total_spent,last_month_expenditure,coupon_used,preferred_category,payment_method,location_type,rating_last_purchase,churned
0,7,16295,4858,0,0,0,0,5,0
1,19,4926,723,0,1,1,0,3,0
2,2,23833,0,1,0,0,1,4,0
3,12,20269,3732,1,0,2,1,1,0
4,10,18931,0,1,1,1,0,3,0


In [8]:
# Step 4: Feature Engineering = Feature scaling (normalizing) + Separating Features and Target
# Adding derived features for better model performance
# Average Order Value
df_encoded['avg_order_value'] = df_encoded.apply(
    lambda row: row['total_spent'] / row['order_count'] if row['order_count'] > 0 else 0,
    axis=1
)

# Spend Change (recent vs. total)
df_encoded['spend_change'] = df_encoded.apply(
    lambda row: row['last_month_expenditure'] / (row['total_spent'] - row['last_month_expenditure'])
    if (row['total_spent'] - row['last_month_expenditure']) > 0 else 0,
    axis=1
)

# High Rating Flag
df_encoded['is_high_rating'] = (df_encoded['rating_last_purchase'] >= 4).astype(int)

# Category Frequency Encoding
cat_freq = df_encoded['preferred_category'].value_counts(normalize=True).to_dict()
df_encoded['category_encoded'] = df_encoded['preferred_category'].map(cat_freq)

# No Purchase Last Month
df_encoded['no_purchase_last_month'] = (df_encoded['last_month_expenditure'] == 0).astype(int)

# Order Frequency Score (proxy: orders per 1000 currency spent)
df_encoded['order_frequency_score'] = df_encoded.apply(
    lambda row: row['order_count'] / (row['total_spent'] / 1000) if row['total_spent'] > 0 else 0,
    axis=1
)

# Spending Trend (last month vs average order value)
df_encoded['spend_trend'] = df_encoded.apply(
    lambda row: (row['last_month_expenditure'] / row['avg_order_value']) if row['avg_order_value'] > 0 else 0,
    axis=1
)

# Columns to scale -numeric only
cols_to_scale = [
    'order_count',
    'total_spent',
    'last_month_expenditure',
    'rating_last_purchase',
    'avg_order_value',
    'spend_change',
    'category_encoded',
    'order_frequency_score',
    'spend_trend'
]
scaler = StandardScaler()
df_scaled = df_encoded.copy()
df_scaled[cols_to_scale] = scaler.fit_transform(df_scaled[cols_to_scale])

# Separating features and target
X = df_scaled.drop(columns=['churned'],axis=1)
y = df_scaled['churned'] 

In [9]:
# 5. Model Training & Evaluation

# Split into train and test (with stratify to preserve class balance in both sets)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply SMOTE only to the training set
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Initialize the model
models = {
    "Random Forest": RandomForestClassifier(
        n_estimators=500,
        max_depth=15,
        min_samples_split=4,
        min_samples_leaf=1,
        class_weight='balanced',
        random_state=42
    ),

    "Logistic Regression": LogisticRegression(
        max_iter=2000,
        class_weight='balanced',
        random_state=42
    ),

    "XGBoost": XGBClassifier(
        n_estimators=600,
        learning_rate=0.03,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        scale_pos_weight=len(y_train[y_train == 0]) / len(y_train[y_train == 1]),
        reg_lambda=1.0,
        gamma=0.1,
        eval_metric='logloss',
        use_label_encoder=False,
        random_state=42
    )
}

# Train, predict, and evaluate each model
for name, model in models.items():
    print(f"\n📌 Model: {name}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\n ROC AUC:", roc_auc_score(y_test, y_prob))
    print("\n Classification Report:\n", classification_report(y_test, y_pred))
    print("\n Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


📌 Model: Random Forest
Accuracy: 0.729

 ROC AUC: 0.7679893538695117

 Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.78      0.80      1354
           1       0.57      0.63      0.60       646

    accuracy                           0.73      2000
   macro avg       0.69      0.70      0.70      2000
weighted avg       0.74      0.73      0.73      2000


 Confusion Matrix:
 [[1054  300]
 [ 242  404]]

📌 Model: Logistic Regression
Accuracy: 0.6755

 ROC AUC: 0.7336855367195467

 Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.68      0.74      1354
           1       0.50      0.66      0.57       646

    accuracy                           0.68      2000
   macro avg       0.65      0.67      0.65      2000
weighted avg       0.71      0.68      0.68      2000


 Confusion Matrix:
 [[922 432]
 [217 429]]

📌 Model: XGBoost


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.7375

 ROC AUC: 0.7757132861696339

 Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.81      0.81      1354
           1       0.59      0.59      0.59       646

    accuracy                           0.74      2000
   macro avg       0.70      0.70      0.70      2000
weighted avg       0.74      0.74      0.74      2000


 Confusion Matrix:
 [[1095  259]
 [ 266  380]]


In [10]:
# Save Random Forest model, encoders, and scaler
joblib.dump(models["Random Forest"], "random_forest_model.pkl")
joblib.dump(label_encoders, "label_encoders.pkl")
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']