In [5]:
import pandas as pd
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# Load your merged data
merged = pd.read_parquet(r"E:\OneDrive\Desktop\Customer Retention Analysis\data\processed\merged_data.parquet")

# Load your existing RFM data from CSV
rfm = pd.read_csv(r"E:\OneDrive\Desktop\Customer Retention Analysis\data\processed\rfm_data.csv")  # Update this path to your actual RFM file path

# Convert to datetime
merged['order_purchase_timestamp'] = pd.to_datetime(merged['order_purchase_timestamp'])

# Last purchase date per customer
last_purchase = merged.groupby('customer_id')['order_purchase_timestamp'].max().reset_index()

# Define churn
cutoff_date = pd.Timestamp.now() - pd.DateOffset(days=90)
last_purchase['churn'] = (last_purchase['order_purchase_timestamp'] < cutoff_date).astype(int)

# Merge with RFM data
rfm_churn = pd.merge(rfm, last_purchase[['customer_id', 'churn']], on='customer_id')

# Check if we have both classes (0 and 1) in our churn data
print(f"Churn distribution: {rfm_churn['churn'].value_counts()}")

# Prepare features for modeling
X = rfm_churn[['recency', 'frequency', 'monetary']]  # Adjust if your column names are different
y = rfm_churn['churn']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

# Train a model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Calculate churn probabilities
# Fix for the IndexError - check the shape of predict_proba output
proba_output = model.predict_proba(X)
print(f"Shape of predict_proba output: {proba_output.shape}")

# Safely get probabilities
if proba_output.shape[1] == 1:
    # Only one class in training data
    churn_probabilities = np.zeros(len(X))
elif proba_output.shape[1] == 2:
    # Normal case - two classes
    churn_probabilities = proba_output[:, 1]
else:
    # Multiclass case - choose the class that corresponds to churn
    print("Warning: More than 2 classes detected. Using class 1 as churn.")
    churn_probabilities = proba_output[:, 1]

# Add predictions to rfm_churn dataframe for easy visualization
rfm_churn['churn_probability'] = churn_probabilities

# Create visualizations
# 1. RFM Segment Distribution
fig1 = px.pie(rfm, names='segment', title='Customer Segments Distribution')
fig1.show()

# 2. Churn Probability vs. Monetary Value
fig2 = px.scatter(
    rfm_churn, 
    x='monetary', 
    y='churn_probability',
    color='churn',
    color_discrete_map={0: 'blue', 1: 'red'},
    title='Churn Probability vs. Monetary Value',
    labels={
        'monetary': 'Customer Spend ($)',
        'churn_probability': 'Probability of Churn',
        'churn': 'Churned Customer'
    }
)
fig2.show()

# 3. Recency vs Frequency colored by churn
fig3 = px.scatter(
    rfm_churn,
    x='recency',
    y='frequency',
    color='churn',
    size='monetary',
    color_discrete_map={0: 'blue', 1: 'red'},
    title='Recency vs Frequency (sized by Monetary)',
    labels={
        'recency': 'Days Since Last Purchase',
        'frequency': 'Number of Purchases',
        'monetary': 'Total Spend ($)',
        'churn': 'Churned Customer'
    }
)
fig3.show()


# Save the model results
rfm_churn.to_csv(r"E:\OneDrive\Desktop\Customer Retention Analysis\data\processed\rfm_churn_predictions.csv", index=False)

print("Model and visualizations complete!")
print(f"Model accuracy on test set: {model.score(X_test, y_test):.4f}")
print(f"Number of customers analyzed: {len(rfm_churn)}")
print(f"Churn rate: {rfm_churn['churn'].mean():.2%}")

Churn distribution: churn
1    99441
Name: count, dtype: int64
Shape of predict_proba output: (99441, 1)


Model and visualizations complete!
Model accuracy on test set: 1.0000
Number of customers analyzed: 99441
Churn rate: 100.00%
