In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import matplotlib.pyplot as plt
import joblib

from src.data.make_data import create_bins
from src.visualizations.plots import plot_churn_metrics
from src.models.evaluate import calculate_segment_value, plot_threshold_analysis_grid

In [None]:
# Read the raw data set into a DataFrame.
raw_data_path = '../data/raw/raw_data.csv'
df_raw = pd.read_csv(raw_data_path)

In this notebook our goal is to identify segments of the customer base deemed to be high value. More precisely we will:
1. Find segments that can be identified using simple rule-based decisions.
2. Evaluate the performance of our predictive model with respect to this metric.

We first make precise what we mean by the value of a segment of the customer base. Specifically we define the value of a segment of the population as the ratio
$$
\text{Value of the Segment} = \frac{\text{\% Churned Balance from the Segment}}{\text{\% Population in the Segment}}.
$$
A high-value segment (say, value > 2) indicates that targeting that segment would recover at least twice the churned balance that would be recovered by targeting a randomly chosen group of customers of the same size.

In [None]:
# Baseline churn metrics over the whole raw data set
churned_mask = df_raw['Exited'] == 1
total_churned_balance = df_raw.loc[churned_mask, 'Balance'].sum()
overall_churn_rate = df_raw['Exited'].mean()

# Report the summed balance of the rows flagged as exited and the share of such rows.
print(f"Total balance lost to churn: {total_churned_balance}")
print(f"Overall churn rate: {100*overall_churn_rate:.2f}%")

# Rule-Based

In [None]:
# Bin edges for the numerical columns that are discretised before plotting.
# NOTE(review): whether an edge value falls into the lower or the upper bin
# depends on how create_bins cuts the data — confirm against its implementation.
bins_dict = {
    'Age': [-float("inf"), 30, 40, 50, 60, 70, float("inf")],
    'CreditScore': [-float("inf"), 579, 669, 739, 799, float("inf")],
    'Balance': [-float("inf"), 0, 100000, 125000, 150000, float("inf")],
}

# Human-readable label for each interval, in the same order as the edges above.
labels_dict = {
    'Age': ['<30', '30-40', '40-50', '50-60', '60-70', '>70'],
    'CreditScore': ['Poor', 'Fair', 'Good', 'Very Good', 'Excellent'],
    'Balance': ['0', '1-100K', '100K-125K', '125K-150K', '>150K'],
}

# n edges delimit n - 1 intervals, so every column needs exactly one label per interval.
for column, edges in bins_dict.items():
    assert len(labels_dict[column]) == len(edges) - 1, f"label/bin mismatch for {column}"

df_binned = create_bins(df_raw, bins_dict, labels_dict)

# Preview the first rows only instead of displaying the entire frame.
df_binned.head()

In [None]:
# Columns to plot churn metrics for: raw columns plus the binned numerical ones.
bin_features = [
    'Geography', 'Gender', 'Tenure', 'NumOfProducts', 'IsActiveMember',
    'Age_binned', 'CreditScore_binned', 'Balance_binned',
]

# One sub-figure per feature, arranged on a 4x2 grid.
fig = plt.figure(figsize=(12, 6*4))
subfigs = fig.subfigures(4, 2, hspace=0.0005, wspace=0.0005)

# `subfigs.flat` walks the grid row by row, so the i-th feature lands in
# row i // 2, column i % 2.
for feature, subfig in zip(bin_features, subfigs.flat):
    # Two vertically stacked axes per feature, both handed to plot_churn_metrics.
    upper_ax, lower_ax = subfig.subplots(2, 1)
    plot_churn_metrics(feature, df_binned, upper_ax, lower_ax)
    subfig.subplots_adjust(hspace=0.05)

plt.show()

Segments that have both a high value and a high churn rate are good targets for retention efforts: retaining these churners yields a higher-than-average return in balance, and fewer resources are wasted on customers who would have stayed anyway.
- **Customers with 3 or 4 products**
- **Customers in the age bracket 50-60**
- **Germany-based customers**

For segments that have a high or above-average value but a lower churn rate, retention efforts yield a lower return (compared to the segments above) and carry a higher risk of spending resources on customers who are not going to churn. Instead, these segments should be treated as indicators that can be combined when deciding which customers to reach out to.
- Customers with balance of >100K
- Customers in the age brackets 40-50, 60-70

In [None]:
# Combined segment: balance above 100K AND age bracket 40-50 or 60-70.
balance_over_100k = df_binned['Balance_binned'].isin(['100K-125K', '125K-150K', '>150K'])
age_40_50_or_60_70 = df_binned['Age_binned'].isin(['40-50', '60-70'])
combo_segment = balance_over_100k & age_40_50_or_60_70

# Segment value and churn rate (share of rows with Exited == 1) of the combined segment.
combo_value = calculate_segment_value(df_binned, combo_segment)
combo_churn_rate = df_binned.loc[combo_segment, 'Exited'].mean()

print(f"Value (Combo): {combo_value:.3f}")
print(f"Churn rate (Combo): {100*(combo_churn_rate):.2f}%")

We see that customers who fall into both of these individually less indicative segments have both a high value and a high churn rate.

In [None]:
# Churn rate, value and size of the customers to be targeted.
# The target segment is the union (logical OR) of four rule-based segments.
has_3_plus_products = df_binned['NumOfProducts'] >= 3
is_age_50_60 = df_binned['Age_binned'] == '50-60'
is_german = df_binned['Geography'] == 'Germany'
# Balance above 100K combined with the age brackets 40-50 or 60-70.
is_high_balance_and_age = (
    df_binned['Balance_binned'].isin(['100K-125K', '125K-150K', '>150K'])
    & df_binned['Age_binned'].isin(['40-50', '60-70'])
)
target_segment = has_3_plus_products | is_age_50_60 | is_german | is_high_balance_and_age

print(f"Value (Target): {calculate_segment_value(df_binned, target_segment):.3f}")
print(f"Churn rate (Target): {100*(df_binned.loc[target_segment, 'Exited'].mean()):.2f}%")
# The mean of a boolean mask is the fraction of rows selected; this avoids
# iterating the Series with the Python builtin `sum` and is formatted like the
# other percentages printed in this notebook.
print(f"Population Size (%): {100*target_segment.mean():.2f}%")

# Predictive Model

In [None]:
# Load the final model and the unseen data set.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load artifacts from a trusted source.
model = joblib.load("../pipelines/final_model.joblib")
df_test = pd.read_csv("../data/clean/test.csv")

# Separate the 'Exited' target column from the remaining feature columns.
y_test = df_test['Exited']
X_test = df_test.drop(columns=['Exited'])

For our model, the class of predicted churners depends on the threshold chosen. We plot the value of the resulting class against the threshold.

In [None]:
# Predict class probabilities and keep the second column only.
# NOTE(review): column 1 is assumed to be the churn class (Exited == 1) —
# confirm against model.classes_.
class_probabilities = model.predict_proba(X_test)
y_pred_proba = class_probabilities[:, 1]

# Value and precision vs. threshold plot
plot_threshold_analysis_grid(X_test, y_test, y_pred_proba)

- At a threshold of 0.3, the number of customers predicted to be churners is slightly above the size of the segment determined by the rule-based heuristics, and the two have similar value. However, our model shows about a 5% improvement in precision compared to the rule-based strategy.
- In general, as we increase the threshold for our model, the value and precision of both the resulting cumulative and binned segments of predicted customers increase.