## Default estimator and WoE binning (Task 3)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from scipy.stats import boxcox
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the raw transactions and parse the transaction timestamp column
# into datetimes so date arithmetic can be done on it later.
df = pd.read_csv("data.csv").assign(
    TransactionStartTime=lambda frame: pd.to_datetime(frame['TransactionStartTime'])
)

In [None]:
# 1. Construct a default estimator (proxy)
# Calculate RFMS features
def calculate_rfms(df):
    """Build a per-customer RFMS table from transaction-level data.

    Parameters
    ----------
    df : pandas.DataFrame
        Transaction records with the columns 'CustomerId', 'TransactionId',
        'Amount' and a datetime-typed 'TransactionStartTime'.

    Returns
    -------
    pandas.DataFrame
        One row per CustomerId with the columns:
        - Recency: whole days between the customer's latest transaction and
          the latest transaction in the whole dataset.
        - Frequency: number of transactions.
        - Monetary: sum of Amount.
        - Standardization: sample standard deviation of Amount (a simple
          proxy), 0.0 for customers with a single transaction.
    """
    # Reference point for recency: the last timestamp present in the data.
    last_date = df['TransactionStartTime'].max()

    # Named aggregation is required here: a plain dict cannot hold two
    # aggregations for 'Amount' (the second key silently replaces the first,
    # which left only three columns and broke the column renaming).
    rfms = df.groupby('CustomerId').agg(
        Recency=('TransactionStartTime', lambda x: (last_date - x.max()).days),
        Frequency=('TransactionId', 'count'),
        Monetary=('Amount', 'sum'),
        Standardization=('Amount', 'std'),
    )

    # The sample std of a single observation is NaN; treat "no observed
    # variability" as 0 so downstream scaling/averaging sees a real number.
    rfms['Standardization'] = rfms['Standardization'].fillna(0.0)
    return rfms

# Build the per-customer RFMS table from the loaded transactions.
rfms = calculate_rfms(df)

In [None]:
# Standardize every RFMS column with StandardScaler, keeping the original
# customer index and column names on the resulting frame.
scaler = StandardScaler()
rfms_normalized = pd.DataFrame(
    data=scaler.fit_transform(rfms),
    index=rfms.index,
    columns=rfms.columns,
)

In [None]:
# Calculate RFMS score (simple average of normalized features).
# - Only the four feature columns are averaged, so re-running this cell (or
#   running it after 'RFMS_Score' / 'Label' exist) gives the same result.
# - Recency counts days since the last transaction, so a larger value means a
#   less active customer; it is negated so that, like the other features,
#   "higher" points towards a better score.
rfms_normalized['RFMS_Score'] = (
    rfms_normalized[['Recency', 'Frequency', 'Monetary', 'Standardization']]
    .assign(Recency=lambda features: -features['Recency'])
    .mean(axis=1)
)

In [None]:
# Histogram (with KDE overlay) of the RFMS scores across all customers.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(rfms_normalized['RFMS_Score'], kde=True, ax=ax)
ax.set_title('Distribution of RFMS Scores')
ax.set_xlabel('RFMS Score')
plt.show()

In [None]:
# Split customers at the median RFMS score: scores at or above the median
# are labelled 'Good', everything else 'Bad'.
threshold = rfms_normalized['RFMS_Score'].median()
at_or_above_threshold = rfms_normalized['RFMS_Score'] >= threshold
rfms_normalized['Label'] = np.where(at_or_above_threshold, 'Good', 'Bad')

In [None]:
# Show the first rows of the scored and labelled table under a heading.
print("RFMS scores and labels:", rfms_normalized.head(), sep="\n")

In [None]:
# Scatter of customers by normalized Recency vs. Monetary; colour encodes the
# Good/Bad label and marker size encodes normalized Frequency.
fig, ax = plt.subplots(figsize=(12, 10))
sns.scatterplot(
    data=rfms_normalized,
    x='Recency',
    y='Monetary',
    hue='Label',
    size='Frequency',
    sizes=(20, 200),
    ax=ax,
)
ax.set_title('Users in RFMS Space')
plt.show()

In [None]:
# 2. Perform Weight of Evidence (WoE) binning
def calculate_woe_iv(df, feature, target, bins=10, zero_adjustment=0.5):
    """Compute per-bin Weight of Evidence and the total Information Value.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing `feature` and `target`. It is not modified.
    feature : str
        Column to evaluate. Object, string and categorical columns are
        grouped by their own values; any other column is cut into quantile
        bins with `pd.qcut`.
    target : str
        Binary target column encoded as 1 (event) / 0 (non-event).
    bins : int, default 10
        Number of quantile bins requested for non-categorical features
        (duplicate bin edges are dropped, so fewer bins may be produced).
    zero_adjustment : float, default 0.5
        Count substituted for a bin's event or non-event count when that
        count is zero, so that WoE stays finite. Pass 0 to disable.

    Returns
    -------
    (pandas.DataFrame, float)
        The per-bin table (index named 'bins'; columns 'count', 'sum',
        'non_event', 'percent_event', 'percent_non_event', 'WoE', 'IV') and
        the Information Value, i.e. the sum of the 'IV' column.
        'sum' and 'non_event' hold the raw counts; the two percent columns
        are computed from the zero-adjusted counts.
    """
    values = df[feature]
    is_categorical = (
        pd.api.types.is_object_dtype(values)
        or pd.api.types.is_string_dtype(values)
        or isinstance(values.dtype, pd.CategoricalDtype)
    )
    if is_categorical:
        bin_labels = values.rename('bins')
    else:
        bin_labels = pd.qcut(values, q=bins, duplicates='drop').rename('bins')

    # Group by a separate Series instead of adding a 'bins' column, so a
    # feature or target that is itself called 'bins' cannot be clobbered.
    grouped = df.groupby(bin_labels, observed=True)[target].agg(['count', 'sum'])
    grouped['non_event'] = grouped['count'] - grouped['sum']

    total_events = grouped['sum'].sum()
    total_non_events = grouped['non_event'].sum()
    # With a single-class target WoE is undefined. A NaN total keeps the
    # per-bin values NaN (and the summed IV at 0.0) instead of producing inf.
    if not total_events > 0:
        total_events = np.nan
    if not total_non_events > 0:
        total_non_events = np.nan

    # A bin with no events (or no non-events) would give log(0) or a division
    # by zero; floor only those empty cells with `zero_adjustment`.
    events = grouped['sum'].astype(float)
    non_events = grouped['non_event'].astype(float)
    events = events.where(events > 0, zero_adjustment)
    non_events = non_events.where(non_events > 0, zero_adjustment)

    grouped['percent_event'] = events / total_events
    grouped['percent_non_event'] = non_events / total_non_events

    grouped['WoE'] = np.log(grouped['percent_event'] / grouped['percent_non_event'])
    grouped['IV'] = (grouped['percent_event'] - grouped['percent_non_event']) * grouped['WoE']

    iv = grouped['IV'].sum()

    return grouped, iv

In [None]:
# Perform WoE binning for each RFMS feature
target = 'Label'
# Encode the label as 1 (Good) / 0 (Bad). Values that are already encoded are
# passed through unchanged, so re-running this cell does not turn the whole
# column into NaN the way a plain dict `.map` would.
label_codes = {'Good': 1, 'Bad': 0}
rfms_normalized[target] = rfms_normalized[target].map(
    lambda value: label_codes.get(value, value)
)

for feature in ['Recency', 'Frequency', 'Monetary', 'Standardization']:
    woe_grouped, iv = calculate_woe_iv(rfms_normalized, feature, target)

    print(f"\nWeight of Evidence for {feature}:")
    print(woe_grouped[['count', 'WoE']])
    print(f"Information Value: {iv}")

    # Visualize WoE: one bar per bin
    plt.figure(figsize=(10, 6))
    woe_grouped['WoE'].plot(kind='bar')
    plt.title(f'Weight of Evidence - {feature}')
    plt.xlabel('Bins')
    plt.ylabel('Weight of Evidence')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

Key observations and next steps:

RFMS Classification: The script classifies users as 'Good' or 'Bad' based on their RFMS score. You may need to adjust the threshold based on domain knowledge or business requirements.
Visualization: The scatter plot in RFMS space helps visualize how users are distributed and classified. This can provide insights into the effectiveness of the classification.
WoE Binning: The Weight of Evidence shows how different bins of each feature contribute to distinguishing between 'Good' and 'Bad' customers. Positive WoE indicates higher odds of being a 'Good' customer, while negative WoE indicates higher odds of being a 'Bad' customer.
Information Value (IV): The IV provides a measure of the predictive power of each feature. Generally:
< 0.02: Unpredictive
0.02 to 0.1: Weak
0.1 to 0.3: Medium
> 0.3: Strong
Feature Selection: Based on the IV values, you can select the most predictive features for your credit scoring model.
Model Development: You can now use these WoE-transformed features to develop your credit scoring model. Logistic Regression is often used with WoE features due to its interpretability.
Validation: Be sure to validate your model on a separate test set, and consider using cross-validation for a more robust performance estimate.
Monitoring: Once deployed, regularly monitor the model's performance and update it as needed, as the relationships between features and creditworthiness may change over time.