In [8]:
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np


# Load your merged data
merged = pd.read_parquet(r"E:\OneDrive\Desktop\Customer Retention Analysis\data\processed\merged_data.parquet")

# Convert to datetime
merged['order_purchase_timestamp'] = pd.to_datetime(merged['order_purchase_timestamp'])

# Get the LATEST DATE IN YOUR DATASET (not current date)
last_date_in_data = merged['order_purchase_timestamp'].max()

# Define churn cutoff as 90 days before last date in dataset
cutoff_date = last_date_in_data - pd.DateOffset(days=90)

# Last purchase date per customer
last_purchase = merged.groupby('customer_id')['order_purchase_timestamp'].max().reset_index()

# Calculate churn based on dataset's timeline
last_purchase['churn'] = (last_purchase['order_purchase_timestamp'] < cutoff_date).astype(int)

# Merge with RFM data
rfm = pd.read_csv(r"E:\OneDrive\Desktop\Customer Retention Analysis\data\processed\rfm_data.csv")
rfm_churn = pd.merge(rfm, last_purchase[['customer_id', 'churn']], on='customer_id')

# Verify class balance
print(f"Churn distribution:\n{rfm_churn['churn'].value_counts()}")

# Prepare features
X = rfm_churn[['recency', 'frequency', 'monetary']]
y = rfm_churn['churn']

# Split data with stratification
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.3, 
    random_state=42, 
    stratify=y  # Preserves class balance
)


Churn distribution:
churn
1    79859
0    18040
Name: count, dtype: int64


In [9]:
# Check the feature names before training
print("Feature Names Before Training:", X.columns.tolist())

# Ensure input data has column names before prediction
X_test_df = pd.DataFrame(X_test, columns=['recency', 'frequency', 'monetary'])


Feature Names Before Training: ['recency', 'frequency', 'monetary']


In [10]:
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("y_test shape:", y_test.shape)
print("Feature Names:", X_train.columns)


X_train shape: (68529, 3)
y_train shape: (68529,)
X_test shape: (29370, 3)
y_test shape: (29370,)
Feature Names: Index(['recency', 'frequency', 'monetary'], dtype='object')


In [11]:
# Train the model
model = RandomForestClassifier(
    n_estimators=100, 
    random_state=42,
    class_weight='balanced'  # Handle class imbalance
)
model.fit(X_train, y_train)  # Training the model

# ✅ Verify class distribution before inference
print("Class distribution in training data:")
print(pd.Series(y_train).value_counts())

# ✅ Calculate churn probabilities
proba_output = model.predict_proba(X)  # Apply model to full dataset (X)

# ✅ Debugging: Check if model learned both classes
if proba_output.shape[1] == 1:
    raise ValueError("Model only learned one class! Check your training data balance.")

print(f"Shape of predict_proba output: {proba_output.shape}")

# ✅ Assign churn probabilities (using column index 1 for churn probability)
rfm_churn['churn_probability'] = proba_output[:, 1]

# ✅ Save processed data if needed
rfm_churn.to_csv("rfm_churn_with_probabilities.csv", index=False)



# Create visualizations
# 1. RFM Segment Distribution
fig1 = px.pie(rfm, names='segment', title='Customer Segments Distribution')
fig1.show()

# 2. Churn Probability vs. Monetary Value
fig2 = px.scatter(
    rfm_churn, 
    x='monetary', 
    y='churn_probability',
    color='churn',
    color_discrete_map={0: 'blue', 1: 'red'},
    title='Churn Probability vs. Monetary Value',
    labels={
        'monetary': 'Customer Spend ($)',
        'churn_probability': 'Probability of Churn',
        'churn': 'Churned Customer'
    }
)
fig2.show()

# 3. Recency vs Frequency colored by churn
fig3 = px.scatter(
    rfm_churn,
    x='recency',
    y='frequency',
    color='churn',
    size='monetary',
    color_discrete_map={0: 'blue', 1: 'red'},
    title='Recency vs Frequency (sized by Monetary)',
    labels={
        'recency': 'Days Since Last Purchase',
        'frequency': 'Number of Purchases',
        'monetary': 'Total Spend ($)',
        'churn': 'Churned Customer'
    }
)
fig3.show()


# Save the model results
rfm_churn.to_csv(r"E:\OneDrive\Desktop\Customer Retention Analysis\data\processed\rfm_churn_predictions.csv", index=False)

print("Model and visualizations complete!")
print(f"Model accuracy on test set: {model.score(X_test, y_test):.4f}")
print(f"Number of customers analyzed: {len(rfm_churn)}")
print(f"Churn rate: {rfm_churn['churn'].mean():.2%}")

Class distribution in training data:
churn
1    55901
0    12628
Name: count, dtype: int64
Shape of predict_proba output: (97899, 2)


Model and visualizations complete!
Model accuracy on test set: 0.9991
Number of customers analyzed: 97899
Churn rate: 81.57%


In [7]:
import pickle

# Save the model to a pickle file
with open("E:\\OneDrive\\Desktop\\Customer Retention Analysis\\models\\rfm_churn_model.pkl", "wb") as f:
    pickle.dump(model, f)

print("Model saved successfully!")

Model saved successfully!
