In [3]:
# Cell 1: Imports and Setup
# Import necessary libraries
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (classification_report, confusion_matrix, roc_curve, 
                           roc_auc_score, precision_recall_curve, auc)

# Create output directory for saving files
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)

In [4]:
# Cell 2: Data Loading and Preprocessing
# Load the dataset
print("Loading the dataset...")
dataset_path = "TelcoCustomerChurn.csv"

# Read the CSV file
try:
    telco_data = pd.read_csv(dataset_path)
    print(f"Dataset successfully loaded with shape: {telco_data.shape}")
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise

# Data Preprocessing
print("\nPreprocessing the data...")

# Convert 'TotalCharges' to numeric, replacing spaces with NaN
if telco_data['TotalCharges'].dtype == 'object':
    telco_data['TotalCharges'] = pd.to_numeric(telco_data['TotalCharges'], errors='coerce')
    telco_data['TotalCharges'].fillna(telco_data['TotalCharges'].mean(), inplace=True)

# Convert 'SeniorCitizen' from 0/1 to 'No'/'Yes' for consistent preprocessing
telco_data['SeniorCitizen'] = telco_data['SeniorCitizen'].map({0: 'No', 1: 'Yes'})

# Separate features and target variable
X = telco_data.drop(['customerID', 'Churn'], axis=1)
y = telco_data['Churn'].map({'Yes': 1, 'No': 0})  # Convert to binary

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

print(f"Categorical columns: {categorical_cols}")
print(f"Numerical columns: {numerical_cols}")

# Create preprocessors for both types of features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(drop='first', sparse_output=False), categorical_cols)  # Changed 'sparse' to 'sparse_output'
    ])

Loading the dataset...
Dataset successfully loaded with shape: (7043, 21)

Preprocessing the data...
Categorical columns: ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']
Numerical columns: ['tenure', 'MonthlyCharges', 'TotalCharges']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  telco_data['TotalCharges'].fillna(telco_data['TotalCharges'].mean(), inplace=True)


TypeError: __init__() got an unexpected keyword argument 'sparse'

In [None]:
# Cell 3: Class Imbalance Handling
# Check for class imbalance
print("\nChecking for class imbalance...")
class_counts = y.value_counts()
print("Target class distribution:")
print(class_counts)
print(f"Class imbalance ratio (majority:minority): {class_counts[0]/class_counts[1]:.2f}:1")

# Split the data into training and testing sets (before handling imbalance)
X_train_original, X_test, y_train_original, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

# Apply preprocessing to test data
X_test_processed = preprocessor.fit_transform(X_test)

# Method 1: Use SMOTE for oversampling the minority class
print("\nImplementing SMOTE oversampling...")
from imblearn.over_sampling import SMOTE

# Apply preprocessing to training data
X_train_processed_original = preprocessor.transform(X_train_original)

# Apply SMOTE to the processed training data
smote = SMOTE(random_state=42)
X_train_processed_smote, y_train_smote = smote.fit_resample(
    X_train_processed_original, y_train_original)

print("Class distribution after SMOTE:")
print(pd.Series(y_train_smote).value_counts())

# Method 2: Use class weights to handle imbalance
print("\nImplementing class weights...")
# Calculate class weights inversely proportional to class frequencies
from sklearn.utils.class_weight import compute_class_weight
class_weights = compute_class_weight(
    class_weight='balanced', classes=np.unique(y_train_original), y=y_train_original)
class_weight_dict = {i: weight for i, weight in enumerate(class_weights)}
print(f"Class weights: {class_weight_dict}")

In [None]:
# Cell 4: Model Training
# Train models with different imbalance handling techniques
# 1. Baseline model (no imbalance handling)
baseline_model = RandomForestClassifier(random_state=42)
baseline_model.fit(X_train_processed_original, y_train_original)

# 2. SMOTE model
smote_model = RandomForestClassifier(random_state=42)
smote_model.fit(X_train_processed_smote, y_train_smote)

# 3. Class weights model
weighted_model = RandomForestClassifier(
    random_state=42, class_weight=class_weight_dict)
weighted_model.fit(X_train_processed_original, y_train_original)

In [None]:
# Cell 5: Custom Evaluation Metrics
print("\nImplementing custom evaluation metrics...")

def calculate_business_metrics(y_true, y_pred, y_prob=None, fn_cost=5, fp_cost=1):
    """
    Calculate business-oriented metrics for model evaluation.
    
    Parameters:
    -----------
    y_true : array-like
        True class labels
    y_pred : array-like
        Predicted class labels
    y_prob : array-like, optional
        Predicted probabilities for the positive class
    fn_cost : float, optional
        Cost of a false negative (missing a churner)
    fp_cost : float, optional
        Cost of a false positive (incorrectly predicting churn)
        
    Returns:
    --------
    dict
        Dictionary of business metrics
    """
    # Calculate confusion matrix elements
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    
    # Calculate standard metrics
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    
    # Calculate business costs
    total_cost = (fn * fn_cost) + (fp * fp_cost)
    cost_per_customer = total_cost / len(y_true)
    
    # Calculate customer retention metrics
    retention_rate = tn / (tn + fn) if (tn + fn) > 0 else 0
    intervention_efficiency = tp / (tp + fp) if (tp + fp) > 0 else 0
    
    # Calculate profitability metrics (assuming average values)
    avg_customer_value = 1000  # Hypothetical average customer lifetime value
    avg_intervention_cost = 100  # Hypothetical cost of retention intervention
    
    # Potential savings from interventions
    potential_savings = tp * avg_customer_value - (tp + fp) * avg_intervention_cost
    
    # ROI of the churn prevention program
    roi = (potential_savings / ((tp + fp) * avg_intervention_cost)) if (tp + fp) > 0 else 0
    
    # Return all metrics in a dictionary
    metrics = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'false_negatives': fn,
        'false_positives': fp,
        'total_business_cost': total_cost,
        'cost_per_customer': cost_per_customer,
        'retention_rate': retention_rate,
        'intervention_efficiency': intervention_efficiency,
        'potential_savings': potential_savings,
        'roi': roi
    }
    
    return metrics

In [None]:
# Cell 5: Custom Evaluation Metrics
print("\nImplementing custom evaluation metrics...")

def calculate_business_metrics(y_true, y_pred, y_prob=None, fn_cost=5, fp_cost=1):
    """
    Calculate business-oriented metrics for model evaluation.
    
    Parameters:
    -----------
    y_true : array-like
        True class labels
    y_pred : array-like
        Predicted class labels
    y_prob : array-like, optional
        Predicted probabilities for the positive class
    fn_cost : float, optional
        Cost of a false negative (missing a churner)
    fp_cost : float, optional
        Cost of a false positive (incorrectly predicting churn)
        
    Returns:
    --------
    dict
        Dictionary of business metrics
    """
    # Calculate confusion matrix elements
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    
    # Calculate standard metrics
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    
    # Calculate business costs
    total_cost = (fn * fn_cost) + (fp * fp_cost)
    cost_per_customer = total_cost / len(y_true)
    
    # Calculate customer retention metrics
    retention_rate = tn / (tn + fn) if (tn + fn) > 0 else 0
    intervention_efficiency = tp / (tp + fp) if (tp + fp) > 0 else 0
    
    # Calculate profitability metrics (assuming average values)
    avg_customer_value = 1000  # Hypothetical average customer lifetime value
    avg_intervention_cost = 100  # Hypothetical cost of retention intervention
    
    # Potential savings from interventions
    potential_savings = tp * avg_customer_value - (tp + fp) * avg_intervention_cost
    
    # ROI of the churn prevention program
    roi = (potential_savings / ((tp + fp) * avg_intervention_cost)) if (tp + fp) > 0 else 0
    
    # Return all metrics in a dictionary
    metrics = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'false_negatives': fn,
        'false_positives': fp,
        'total_business_cost': total_cost,
        'cost_per_customer': cost_per_customer,
        'retention_rate': retention_rate,
        'intervention_efficiency': intervention_efficiency,
        'potential_savings': potential_savings,
        'roi': roi
    }
    
    return metrics

In [None]:
# Cell 6: Threshold Optimization
print("\nFinding optimal classification thresholds...")

def find_optimal_thresholds(model, X, y_true, metric_name='f1', fn_cost=5, fp_cost=1):
    """
    Find the optimal classification threshold based on various metrics.
    
    Parameters:
    -----------
    model : estimator
        Trained classifier with predict_proba method
    X : array-like
        Input features
    y_true : array-like
        True class labels
    metric_name : str, optional
        Metric to optimize ('f1', 'cost', 'precision', 'recall', 'roi')
    fn_cost : float, optional
        Cost of a false negative
    fp_cost : float, optional
        Cost of a false positive
        
    Returns:
    --------
    dict
        Dictionary with optimal thresholds for different metrics
    """
    # Get predicted probabilities
    y_probs = model.predict_proba(X)[:, 1]
    
    # Initialize variables
    thresholds = np.linspace(0.01, 0.99, 99)
    metrics = {
        'threshold': [],
        'accuracy': [],
        'precision': [],
        'recall': [],
        'f1': [],
        'business_cost': [],
        'roi': []
    }
    
    # Calculate metrics for each threshold
    for threshold in thresholds:
        y_pred = (y_probs >= threshold).astype(int)
        
        # Calculate standard metrics
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        
        # Calculate business cost
        business_cost = (fn * fn_cost) + (fp * fp_cost)
        
        # Calculate ROI (simplified)
        avg_customer_value = 1000
        avg_intervention_cost = 100
        potential_savings = tp * avg_customer_value - (tp + fp) * avg_intervention_cost
        roi = (potential_savings / ((tp + fp) * avg_intervention_cost)) if (tp + fp) > 0 else 0
        
        # Store results
        metrics['threshold'].append(threshold)
        metrics['accuracy'].append(accuracy)
        metrics['precision'].append(precision)
        metrics['recall'].append(recall)
        metrics['f1'].append(f1)
        metrics['business_cost'].append(business_cost)
        metrics['roi'].append(roi)
    
    # Find optimal thresholds
    results = {
        'accuracy': thresholds[np.argmax(metrics['accuracy'])],
        'precision': thresholds[np.argmax(metrics['precision'])],
        'recall': thresholds[np.argmax(metrics['recall'])],
        'f1': thresholds[np.argmax(metrics['f1'])],
        'business_cost': thresholds[np.argmin(metrics['business_cost'])],
        'roi': thresholds[np.argmax(metrics['roi'])]
    }
    
    # Return threshold based on specified metric
    if metric_name == 'cost':
        optimal_threshold = results['business_cost']
    else:
        optimal_threshold = results[metric_name]
    
    return {
        'optimal_threshold': optimal_threshold,
        'all_thresholds': results,
        'metrics': pd.DataFrame(metrics)
    }

# Find optimal thresholds for each model
thresholds_baseline = find_optimal_thresholds(
    baseline_model, X_test_processed, y_test, metric_name='f1')
thresholds_smote = find_optimal_thresholds(
    smote_model, X_test_processed, y_test, metric_name='f1')
thresholds_weighted = find_optimal_thresholds(
    weighted_model, X_test_processed, y_test, metric_name='f1')

# Print optimal thresholds
print("\nOptimal Thresholds (F1 Score):")
print(f"Baseline model: {thresholds_baseline['optimal_threshold']:.4f}")
print(f"SMOTE model: {thresholds_smote['optimal_threshold']:.4f}")
print(f"Weighted model: {thresholds_weighted['optimal_threshold']:.4f}")

# Also find optimal thresholds for business cost
thresholds_cost_baseline = find_optimal_thresholds(
    baseline_model, X_test_processed, y_test, metric_name='cost')
thresholds_cost_smote = find_optimal_thresholds(
    smote_model, X_test_processed, y_test, metric_name='cost')
thresholds_cost_weighted = find_optimal_thresholds(
    weighted_model, X_test_processed, y_test, metric_name='cost')

print("\nOptimal Thresholds (Business Cost):")
print(f"Baseline model: {thresholds_cost_baseline['optimal_threshold']:.4f}")
print(f"SMOTE model: {thresholds_cost_smote['optimal_threshold']:.4f}")
print(f"Weighted model: {thresholds_cost_weighted['optimal_threshold']:.4f}")

In [None]:
# Cell 7: Custom Visualizations
# TODO: Create visualizations to understand model performance from a business perspective
# Suggestions:
# 1. Plot business cost vs. threshold for each model
# 2. Plot ROI vs. threshold for each model

print("\nCreating custom visualizations...")

# Example placeholder: Plot business cost vs. threshold
plt.figure(figsize=(10, 6))
plt.plot(thresholds_baseline['metrics']['threshold'], thresholds_baseline['metrics']['business_cost'], label='Baseline Model')
plt.plot(thresholds_smote['metrics']['threshold'], thresholds_smote['metrics']['business_cost'], label='SMOTE Model')
plt.plot(thresholds_weighted['metrics']['threshold'], thresholds_weighted['metrics']['business_cost'], label='Weighted Model')
plt.xlabel('Threshold')
plt.ylabel('Business Cost')
plt.title('Business Cost vs. Classification Threshold')
plt.legend()
plt.grid(True)
plt.savefig(os.path.join(output_dir, 'business_cost_vs_threshold.png'))
plt.show()

# Add more visualizations as needed