# Customer Churn Prediction - Machine Learning Model
# This notebook continues from RFM_Analysis.ipynb


In [None]:
# Part 1: Setup and Data Loading

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report

print("Libraries loaded successfully")

Libraries loaded successfully


In [4]:
## Part 2: Load RFM Analysis Results

# Load the output from RFM analysis
df_rfm = pd.read_csv('customer_features.csv')
print(f"Data loaded: {df_rfm.shape}")
print(f"Columns: {df_rfm.columns.tolist()}")

# Verify churn definition (should be 120-day based on RFM analysis)
print(f"\nChurn rate (120-day threshold): {df_rfm['Churn'].mean():.1%}")
# Display first few rows
display(df_rfm.head())

Data loaded: (3212, 23)
Columns: ['CustomerID', 'Recency', 'Frequency', 'Monetary', 'Churn', 'R_Score', 'F_Score', 'M_Score', 'RFM_Score_Combined', 'avg_order_value', 'std_order_value', 'min_order_value', 'max_order_value', 'avg_items_per_order', 'total_items_purchased', 'unique_purchase_days', 'customer_lifetime_days', 'purchase_frequency_days', 'days_since_first_purchase', 'is_uk', 'value_per_frequency', 'consistency_ratio', 'purchase_acceleration']

Churn rate (120-day threshold): 28.6%


Unnamed: 0,CustomerID,Recency,Frequency,Monetary,Churn,R_Score,F_Score,M_Score,RFM_Score_Combined,avg_order_value,...,avg_items_per_order,total_items_purchased,unique_purchase_days,customer_lifetime_days,purchase_frequency_days,days_since_first_purchase,is_uk,value_per_frequency,consistency_ratio,purchase_acceleration
0,12346.0,205,1,77183.6,1,1,1,4,6,77183.6,...,74215.0,74215,1,0,0.0,205,1,77183.6,0.0,0.004854
1,12347.0,9,124,2790.86,0,4,4,4,12,22.506935,...,12.822581,1590,5,237,47.4,246,0,22.506935,1.065328,0.502024
2,12348.0,128,28,1487.24,1,2,2,4,8,53.115714,...,75.857143,2124,3,109,36.333333,237,0,53.115714,0.851359,0.117647
3,12350.0,189,17,334.4,1,1,2,2,5,19.670588,...,11.588235,197,1,0,0.0,189,0,19.670588,0.351975,0.089474
4,12352.0,141,38,1561.81,1,1,3,4,8,41.100263,...,6.684211,254,5,34,6.8,176,0,41.100263,1.7751,0.214689


## Part 3: Data Leakage Detection
### 3.1 Testing with 90-Day Threshold (Original Approach)

In [6]:
# Create 90-day churn to demonstrate why it causes leakage
df_90 = df_rfm.copy()
df_90['Churn_90'] = (df_90['Recency'] > 90).astype(int)

print("Comparison of Churn Definitions:")
churn_comparison = pd.DataFrame({
    'Threshold': ['90 days', '120 days'],
    'Churn Rate': [f"{df_90['Churn_90'].mean():.1%}", f"{df_rfm['Churn'].mean():.1%}"],
    'Total Churned': [df_90['Churn_90'].sum(), df_rfm['Churn'].sum()]
})
display(churn_comparison)

# Test models with 90-day threshold
X_90 = df_90.drop(['CustomerID', 'Churn', 'Churn_90'], axis=1)
y_90 = df_90['Churn_90']

X_train_90, X_test_90, y_train_90, y_test_90 = train_test_split(
    X_90, y_90, test_size=0.2, random_state=42, stratify=y_90
)

# Scale features
scaler_90 = StandardScaler()
X_train_scaled_90 = scaler_90.fit_transform(X_train_90)
X_test_scaled_90 = scaler_90.transform(X_test_90)

# Test multiple models
models_90 = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42)
}

print("\nModel Performance with 90-day threshold:")
results_90 = []
for name, model in models_90.items():
    model.fit(X_train_scaled_90, y_train_90)
    acc = model.score(X_test_scaled_90, y_test_90)
    results_90.append({'Model': name, 'Accuracy': f"{acc:.1%}"})
    
display(pd.DataFrame(results_90))

Comparison of Churn Definitions:


Unnamed: 0,Threshold,Churn Rate,Total Churned
0,90 days,38.4%,1234
1,120 days,28.6%,920



Model Performance with 90-day threshold:


Unnamed: 0,Model,Accuracy
0,Logistic Regression,99.5%
1,Random Forest,100.0%
2,Gradient Boosting,100.0%


### 3.2 Feature Correlation Analysis

In [None]:
# Check correlation with churn
correlations = pd.DataFrame({
    'Feature': X_90.columns,
    'Correlation': [abs(df_90[col].corr(df_90['Churn_90'])) for col in X_90.columns]
}).sort_values('Correlation', ascending=False)

print("\nTop Features Correlated with Churn:")
display(correlations.head(5))


Top Features Correlated with Churn:


Unnamed: 0,Feature,Correlation
0,Recency,0.854285
3,R_Score,0.813545
6,RFM_Score_Combined,0.642641
14,customer_lifetime_days,0.514884
5,M_Score,0.411626


## Part 4: Clean Model Development

### 4.1 Create Clean Feature Set

In [12]:
# Remove leaky features identified from correlation analysis
features_to_remove = ['Recency', 'R_Score', 'days_since_first_purchase']

# Prepare data with 120-day churn (production threshold)
X = df_rfm.drop(['CustomerID', 'Churn'], axis=1)
y = df_rfm['Churn']

# Create clean features
X_clean = X.drop(columns=[col for col in features_to_remove if col in X.columns])

print(f"Original features: {X.shape[1]}")
print(f"Features removed: {features_to_remove}")
print(f"Clean features: {X_clean.shape[1]}")

Original features: 21
Features removed: ['Recency', 'R_Score', 'days_since_first_purchase']
Clean features: 18


### 4.2 Train-Test Split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(
    X_clean, y, test_size=0.2, random_state=42, stratify=y
)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set: {X_train_scaled.shape}")
print(f"Test set: {X_test_scaled.shape}")


Training set: (2569, 18)
Test set: (643, 18)


### 4.3 Model Training and Evaluation

In [16]:
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42)
}

results = []
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    
    results.append({
        'Model': name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1-Score': f1_score(y_test, y_pred)
    })

results_df = pd.DataFrame(results).round(3)
print("\nFinal Model Performance (Clean Features):")
display(results_df)
#!/usr/bin/env python3  


Final Model Performance (Clean Features):


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,Logistic Regression,0.956,1.0,0.848,0.918
1,Random Forest,0.949,0.958,0.859,0.905
2,Gradient Boosting,0.955,0.975,0.864,0.916


## Part 5: Summary

In [17]:
comparison_summary = pd.DataFrame({
    'Scenario': ['90-day with leakage', '120-day clean features'],
    'Best Accuracy': ['~100%', '~95%'],
    'Key Issue': ['Data leakage via Recency', 'None - Production ready']
})

print("\nData Leakage Resolution Summary:")
display(comparison_summary)


Data Leakage Resolution Summary:


Unnamed: 0,Scenario,Best Accuracy,Key Issue
0,90-day with leakage,~100%,Data leakage via Recency
1,120-day clean features,~95%,None - Production ready
