# Customer Churn Prediction — Bank Customers (Churn Modelling Dataset)

This notebook walks you end-to-end:
1. Load & inspect data  
2. Clean & prepare features  
3. Encode categorical variables (Geography, Gender)  
4. Train & evaluate classifiers (Logistic Regression, Random Forest)  
5. Analyze feature importance
6. Export a ready model with a reusable inference pipeline

> **Dataset**: Use `Churn_Modelling.csv` (commonly available on Kaggle). Place it in the same folder as this notebook.

## 0. Setup

In [3]:

# Core
import pandas as pd
import numpy as np

# Modeling
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    classification_report, confusion_matrix, RocCurveDisplay
)
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Plotting
import matplotlib.pyplot as plt

# Utility
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', 100)

print('Libraries loaded.')


Libraries loaded.


## 1. Load & Quick Explore

In [5]:
# --- Update path if needed ---
DATA_PATH = 'Churn_Modelling.csv'

df_raw = pd.read_csv(DATA_PATH)
df = df_raw.copy()

print('Shape:', df.shape)
display(df.head())
display(df.tail(2))
display(df.info())
display(df.isnull().sum())

Shape: (10000, 14)


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.1,0


Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
9998,9999,15682355,Sabbatini,772,Germany,Male,42,3,75075.31,2,1,0,92888.52,1
9999,10000,15628319,Walker,792,France,Female,28,4,130142.79,1,1,0,38190.78,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   RowNumber        10000 non-null  int64  
 1   CustomerId       10000 non-null  int64  
 2   Surname          10000 non-null  object 
 3   CreditScore      10000 non-null  int64  
 4   Geography        10000 non-null  object 
 5   Gender           10000 non-null  object 
 6   Age              10000 non-null  int64  
 7   Tenure           10000 non-null  int64  
 8   Balance          10000 non-null  float64
 9   NumOfProducts    10000 non-null  int64  
 10  HasCrCard        10000 non-null  int64  
 11  IsActiveMember   10000 non-null  int64  
 12  EstimatedSalary  10000 non-null  float64
 13  Exited           10000 non-null  int64  
dtypes: float64(2), int64(9), object(3)
memory usage: 1.1+ MB


None

RowNumber          0
CustomerId         0
Surname            0
CreditScore        0
Geography          0
Gender             0
Age                0
Tenure             0
Balance            0
NumOfProducts      0
HasCrCard          0
IsActiveMember     0
EstimatedSalary    0
Exited             0
dtype: int64

In [6]:
# Drop uninformative identifiers
drop_cols = ['RowNumber', 'CustomerId', 'Surname']
df = df.drop(columns=drop_cols, errors='ignore')

# Sanity checks
assert 'Exited' in df.columns, "Target column 'Exited' not found. Check the dataset."

# Separate target and features
X = df.drop(columns=['Exited'])
y = df['Exited']

# Identify column types
categorical_cols = ['Geography', 'Gender']
numeric_cols = [c for c in X.columns if c not in categorical_cols]

print('Categorical:', categorical_cols)
print('Numeric:', numeric_cols)

Categorical: ['Geography', 'Gender']
Numeric: ['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary']


## 3. Train / Test Split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print('Train:', X_train.shape, ' Test:', X_test.shape)
print('Churn rate (train):', y_train.mean().round(4), ' Churn rate (test):', y_test.mean().round(4))

Train: (8000, 10)  Test: (2000, 10)
Churn rate (train): 0.2037  Churn rate (test): 0.2035


## 4. Preprocessing & Encoders

In [8]:

# Categorical: OneHotEncode Geography & Gender
# Numeric: Keep as-is; scale for linear models
categorical_transformer = OneHotEncoder(handle_unknown='ignore', drop='first')

numeric_transformer_for_linear = StandardScaler(with_mean=False)
numeric_transformer_for_tree = 'passthrough'  # Trees don't need scaling

# ColumnTransformers for two families of models
preprocess_linear = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols),
        ('num', numeric_transformer_for_linear, numeric_cols)
    ],
    remainder='drop'
)

preprocess_tree = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols),
        ('num', numeric_transformer_for_tree, numeric_cols)
    ],
    remainder='drop'
)
print('Preprocessors ready.')


Preprocessors ready.


## 5. Train Baseline Models

In [10]:

import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

proba = pipe.predict_proba(X_test)[:, 1]
print(f'\n=== {name} ===')
print('Accuracy:', round(accuracy_score(y_test, preds), 4))
print('Precision:', round(precision_score(y_test, preds), 4))
print('Recall:', round(recall_score(y_test, preds), 4))
print('F1 Score:', round(f1_score(y_test, preds), 4))
print('ROC AUC:', round(roc_auc_score(y_test, proba), 4))



=== LogisticRegression ===
Accuracy: 0.7135
Precision: 0.3872
Recall: 0.7002
F1 Score: 0.4987
ROC AUC: 0.7772


### Confusion Matrix & ROC Curve (Best Model)

In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

scores = {}

for name, pipe in models.items():
    # Fit the model
    pipe.fit(X_train, y_train)
    
    # Predictions
    preds = pipe.predict(X_test)
    proba = pipe.predict_proba(X_test)[:, 1]
    
    # Calculate metrics
    acc = round(accuracy_score(y_test, preds), 4)
    prec = round(precision_score(y_test, preds), 4)
    rec = round(recall_score(y_test, preds), 4)
    f1 = round(f1_score(y_test, preds), 4)
    auc = round(roc_auc_score(y_test, proba), 4)
    
    # Save AUC score for model comparison
    scores[name] = auc
    
    # Print results
    print(f"\n=== {name} ===")
    print(f"Accuracy : {acc}")
    print(f"Precision: {prec}")
    print(f"Recall   : {rec}")
    print(f"F1 Score : {f1}")
    print(f"ROC AUC  : {auc}")

# Best model selection
best_name = max(scores, key=scores.get)
print(f"\nBest Model: {best_name} with ROC AUC = {scores[best_name]}")



=== LogisticRegression ===
Accuracy : 0.7135
Precision: 0.3872
Recall   : 0.7002
F1 Score : 0.4987
ROC AUC  : 0.7772

=== RandomForest ===
Accuracy : 0.864
Precision: 0.7848
Recall   : 0.457
F1 Score : 0.5776
ROC AUC  : 0.8575

Best Model: RandomForest with ROC AUC = 0.8575


## 6. Feature Importance

In [14]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

scores = {}
fitted_models = {}  # store fitted models here

for name, pipe in models.items():
    # Fit
    pipe.fit(X_train, y_train)
    fitted_models[name] = pipe  # save the trained model
    
    # Predictions
    preds = pipe.predict(X_test)
    proba = pipe.predict_proba(X_test)[:, 1]
    
    # Metrics
    acc = round(accuracy_score(y_test, preds), 4)
    prec = round(precision_score(y_test, preds), 4)
    rec = round(recall_score(y_test, preds), 4)
    f1 = round(f1_score(y_test, preds), 4)
    auc = round(roc_auc_score(y_test, proba), 4)
    
    scores[name] = auc
    
    print(f"\n=== {name} ===")
    print(f"Accuracy : {acc}")
    print(f"Precision: {prec}")
    print(f"Recall   : {rec}")
    print(f"F1 Score : {f1}")
    print(f"ROC AUC  : {auc}")

# Pick best
best_name = max(scores, key=scores.get)
best_model = fitted_models[best_name]  # store the fitted best model
print(f"\nBest Model: {best_name} with ROC AUC = {scores[best_name]}")



=== LogisticRegression ===
Accuracy : 0.7135
Precision: 0.3872
Recall   : 0.7002
F1 Score : 0.4987
ROC AUC  : 0.7772

=== RandomForest ===
Accuracy : 0.864
Precision: 0.7848
Recall   : 0.457
F1 Score : 0.5776
ROC AUC  : 0.8575

Best Model: RandomForest with ROC AUC = 0.8575


## 7. Export Trained Pipeline

In [15]:
import joblib

FINAL_MODEL_PATH = 'churn_pipeline.joblib'
joblib.dump(best_model, FINAL_MODEL_PATH)
print('Saved:', FINAL_MODEL_PATH)


Saved: churn_pipeline.joblib


## 8. Inference Example

In [16]:
# Example: predict churn probability for a single (or few) customers.
# Note: Use original feature names BEFORE preprocessing.

example = pd.DataFrame([{
    'CreditScore': 650,
    'Geography': 'Germany',
    'Gender': 'Female',
    'Age': 40,
    'Tenure': 3,
    'Balance': 120000.0,
    'NumOfProducts': 1,
    'HasCrCard': 1,
    'IsActiveMember': 0,
    'EstimatedSalary': 75000.0
}])

prob = best_model.predict_proba(example)[:,1][0]
pred = int(prob >= 0.5)
print(f'Predicted churn probability: {prob:.3f}  ->  Class: {pred}')


Predicted churn probability: 0.453  ->  Class: 0
