* Jonathan Leeper
* Project 3 - Customer Churn Prediction

# E-Commerce Customer Churn Prediction Model

## imports

In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
import joblib
import numpy as np
df = pd.read_excel("E Commerce Dataset.xlsx", sheet_name="E Comm")

## Step 1: Import Dataset, Choose Churn as Target, and begin training

In [2]:
X = df.drop('Churn', axis=1)  # Features
y = df['Churn']  # Target

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing pipeline
numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

## Step 2: Create a Preprocessor for our models to import

In [3]:
# Column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numerical_cols),
        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))  # Handle unknown categories
        ]), categorical_cols)
    ])

## Step 3: Function for training & Evaluating the model

In [4]:
# Function to train and evaluate models
def train_and_evaluate(model, X_train, y_train, X_test, y_test):
    # Create a pipeline with preprocessing and model
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Fit the model
    pipeline.fit(X_train, y_train)

    # Predict on test data
    y_pred = pipeline.predict(X_test)

    # Evaluate model
    print(f"Model: {model.__class__.__name__}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("="*50)

## Step 4: Training each model

In [5]:
# Models to compare
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42)
}

# Train and evaluate each model
for model_name, model in models.items():
    train_and_evaluate(model, X_train, y_train, X_test, y_test)

Model: LogisticRegression
Confusion Matrix:
[[926  15]
 [ 80 105]]

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.98      0.95       941
           1       0.88      0.57      0.69       185

    accuracy                           0.92      1126
   macro avg       0.90      0.78      0.82      1126
weighted avg       0.91      0.92      0.91      1126

Model: DecisionTreeClassifier
Confusion Matrix:
[[919  22]
 [ 27 158]]

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.97       941
           1       0.88      0.85      0.87       185

    accuracy                           0.96      1126
   macro avg       0.92      0.92      0.92      1126
weighted avg       0.96      0.96      0.96      1126

Model: RandomForestClassifier
Confusion Matrix:
[[939   2]
 [ 39 146]]

Classification Report:
              precision    recall  f1-score   support

          

## Step 5: Using Hyperparameters to find the best scores & model

In [6]:
# Define the hyperparameter grids for Decision Tree and Random Forest
param_grid_dt = {
    'model__max_depth': [5, 10, 15, 20],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 5]
}

param_grid_rf = {
    'model__n_estimators': [50, 100, 150],
    'model__max_depth': [5, 10, 15],
    'model__min_samples_split': [2, 5, 10],
    'model__min_samples_leaf': [1, 2, 5],
    'model__max_features': ['auto', 'sqrt', 'log2']
}

# Create pipelines for Decision Tree and Random Forest with preprocessing
pipeline_dt = Pipeline([
    ('preprocessor', preprocessor),
    ('model', DecisionTreeClassifier(random_state=42))
])

pipeline_rf = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=42))
])

## Step 6: Using GridSearchCV

In [7]:
# GridSearchCV for Decision Tree
grid_search_dt = GridSearchCV(pipeline_dt, param_grid_dt, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search_dt.fit(X_train, y_train)

# GridSearchCV for Random Forest
grid_search_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search_rf.fit(X_train, y_train)

# Best parameters and scores
print("Best parameters for Decision Tree:", grid_search_dt.best_params_)
print("Best score for Decision Tree:", grid_search_dt.best_score_)

print("Best parameters for Random Forest:", grid_search_rf.best_params_)
print("Best score for Random Forest:", grid_search_rf.best_score_)

# Evaluate best models on the test set
best_dt_model = grid_search_dt.best_estimator_
best_rf_model = grid_search_rf.best_estimator_

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best parameters for Decision Tree: {'model__max_depth': 15, 'model__min_samples_leaf': 1, 'model__min_samples_split': 2}
Best score for Decision Tree: 0.9251823899371068
Best parameters for Random Forest: {'model__max_depth': 15, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 100}
Best score for Random Forest: 0.9429419163891971


## Step 7: Using Test Set For Each of the selected best models

In [26]:
# Test set evaluation for Decision Tree
y_pred_dt = best_dt_model.predict(X_test)
print("\nDecision Tree - Classification Report:")
print(classification_report(y_test, y_pred_dt))

# Test set evaluation for Random Forest
y_pred_rf = best_rf_model.predict(X_test)
print("\nRandom Forest - Classification Report:")
print(classification_report(y_test, y_pred_rf))


Decision Tree - Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.98      0.97       941
           1       0.87      0.84      0.86       185

    accuracy                           0.95      1126
   macro avg       0.92      0.91      0.91      1126
weighted avg       0.95      0.95      0.95      1126


Random Forest - Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       941
           1       0.99      0.74      0.84       185

    accuracy                           0.95      1126
   macro avg       0.97      0.87      0.91      1126
weighted avg       0.96      0.95      0.95      1126



**Although the choice between the two models is tough, as recall is a better score for decision tree**, Precision on deciding if Customer Churn will happen is more important in this experiment. In this case, I decided Random Forest is a better model for accurate future predictions. 

## Step 8: Saving the model as a PKL file, for accurate future predictions without needing to train data again.

In [28]:
joblib.dump(best_rf_model, 'CustomerChurnRF.pkl')

loaded_model = joblib.load('CustomerChurnRF.pkl')

['random_forest_model.pkl']

## Final Product: Predicting Customer Churn on New Customer Data

In [49]:
# Column names (use the same ones from your training dataset)
columns = ['CustomerID', 'Tenure', 'PreferredLoginDevice', 'CityTier', 
           'WarehouseToHome', 'PreferredPaymentMode', 'Gender', 
           'HourSpendOnApp', 'NumberOfDeviceRegistered', 'PreferedOrderCat', 
           'SatisfactionScore', 'MaritalStatus', 'NumberOfAddress', 'Complain', 
           'OrderAmountHikeFromlastYear', 'CouponUsed', 'OrderCount', 
           'DaySinceLastOrder', 'CashbackAmount']

# Example of a new customer
new_customer_data = np.array([[10001,   # CustomerID (irrelevant)
                               3,       # Tenure
                               0,       # PreferredLoginDevice (0 = Mobile, 1 = Phone)
                               2,       # CityTier
                               10,      # WarehouseToHome
                               3,       # PreferredPaymentMode (coded as an integer, same as login device & gender)
                               1,       # Gender (0 = Female, 1 = Male)
                               2,       # HourSpendOnApp
                               2,       # NumberOfDeviceRegistered
                               1,       # PreferedOrderCat (coded, same as the other integers)
                               3,       # SatisfactionScore
                               1,       # MaritalStatus (coded)
                               5,       # NumberOfAddress
                               0,       # Complain
                               5,       # OrderAmountHikeFromlastYear
                               0,       # CouponUsed
                               3,       # OrderCount
                               10,      # DaySinceLastOrder
                               135]])  # CashbackAmount (numeric)

# Create a DataFrame with the new data, using the column names from the training data
new_data_df = pd.DataFrame(new_customer_data, columns=columns)

# Now you can use the transformed data for prediction
prediction = loaded_model.predict(new_data_df)

print(f"Prediction (0 = Not churned, 1 = Churned): {prediction}")

Prediction (0 = Not churned, 1 = Churned): [0]


As you can see above, With our new customer data, we were able to determine that this customer will not churn.