In [5]:
"""
Handcrafted Churn Predictor — Clean, Explainable, Effective
- Focus: meaningful features, interpretability, good accuracy
- Models: LogisticRegression, RandomForest, GradientBoosting + soft Voting
- Feature selection: SelectKBest (top 20)
- Optional imbalance handling (SMOTE) only if needed
- Outputs: saved model components + churn_predictions.csv + short business insights
"""

import warnings
warnings.filterwarnings("ignore")

import os
import pickle
from typing import List

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, RobustScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix, classification_report)

# Optional: for imbalance handling if class imbalance detected
try:
    from imblearn.combine import SMOTETomek
    IMBL_AVAILABLE = True
except Exception:
    IMBL_AVAILABLE = False


class HandcraftedChurnPredictor:
    """
    A compact, purposeful churn pipeline:
      - clean + small set of explainable features
      - encode, split, optional SMOTE
      - SelectKBest (top 20)
      - scale, train three models + voting ensemble
      - evaluate, save, export actionable predictions
    """

    def __init__(self, data_path: str, k_best: int = 20, random_state: int = 42):
        self.data_path = data_path
        self.random_state = random_state
        self.k_best = k_best
        self.df = None
        self.label_encoders = {}
        self.scaler = RobustScaler()
        self.feature_selector = None
        self.selected_features: List[str] = []
        self.best_model = None
        self.results = {}
        

    # ------------------------
    # 1. Load & basic checks
    # ------------------------
    def load_data(self):
        print("\n" + "=" * 60)
        print("1) LOADING DATA & BASIC CHECKS")
        print("=" * 60)
        self.df = pd.read_csv(self.data_path)
        print(f"Loaded dataset with shape: {self.df.shape}")


        # check churn column presence
        if 'Churn' not in self.df.columns:
            raise ValueError("Dataset must contain a 'Churn' column.")

        # simple class balance check
        churn_dist = self.df['Churn'].value_counts(normalize=True)
        print("\nClass distribution (normalized):")
        print(churn_dist.to_dict())
        imbalance_ratio = churn_dist.max() / churn_dist.min()
        print(f"Imbalance ratio: {imbalance_ratio:.2f}:1")

        # decide whether to attempt imbalance handling
        self.apply_smote = imbalance_ratio > 1.5 and IMBL_AVAILABLE
        if imbalance_ratio > 1.5 and not IMBL_AVAILABLE:
            print("Imbalance detected but imblearn not available — skipping SMOTE.")
            self.apply_smote = False
        elif self.apply_smote:
            print("⚠️  Class imbalance detected — SMOTETomek will be used on training set.")
        else:
            print("Class balance acceptable — proceeding without SMOTE.")

        return self

    # ------------------------
    # 2. Feature engineering (human-focused)
    # ------------------------
    def feature_engineering(self):
        print("\n" + "=" * 60)
        print("2) FEATURE ENGINEERING — focused & interpretable")
        print("=" * 60)

        df = self.df.copy()

        # keep original customer id if present for export
        self.customer_id_col = None
        for cid in ['customerID', 'customerId', 'CustomerID']:
            if cid in df.columns:
                self.customer_id_col = cid
                break

        # convert numeric-ish columns
        if 'TotalCharges' in df.columns:
            df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

        # fill sensible missing TotalCharges using MonthlyCharges * tenure
        if set(['TotalCharges', 'MonthlyCharges', 'tenure']).issubset(df.columns):
            df['TotalCharges'] = df['TotalCharges'].fillna(df['MonthlyCharges'] * df['tenure'])

        # core handcrafted features — each should be explainable to a manager
        # we'll create only a tight set (approx 10-12) to keep the model clean
        # check presence of columns before using them (robustness)
        def col_exists(c): return c in df.columns

        # Avg monthly charge (proxy for value)
        if col_exists('TotalCharges') and col_exists('tenure'):
            df['AvgMonthlyCharges'] = df['TotalCharges'] / (df['tenure'] + 1)
        elif col_exists('MonthlyCharges'):
            df['AvgMonthlyCharges'] = df['MonthlyCharges']

        # Is new customer
        if col_exists('tenure'):
            df['IsNewCustomer'] = (df['tenure'] <= 12).astype(int)
            df['IsLoyalCustomer'] = (df['tenure'] > 36).astype(int)
        else:
            df['IsNewCustomer'] = 0
            df['IsLoyalCustomer'] = 0

        # Contract risk
        if col_exists('Contract'):
            df['IsMonthToMonth'] = (df['Contract'] == 'Month-to-month').astype(int)
            contract_value = {'Month-to-month': 1, 'One year': 12, 'Two year': 24}
            df['ContractValue'] = df['Contract'].map(contract_value).fillna(1)
        else:
            df['IsMonthToMonth'] = 0
            df['ContractValue'] = 1

        # Services count (robust count of key services)
        service_cols = [c for c in ['PhoneService', 'MultipleLines', 'InternetService',
                                    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                                    'TechSupport', 'StreamingTV', 'StreamingMovies'] if c in df.columns]
        if service_cols:
            # treat 'No' or similar as not having
            df['TotalServices'] = 0
            for c in service_cols:
                df['TotalServices'] += (~df[c].isin(['No', 'No phone service', 'No internet service', 'None'])).astype(int)
        else:
            df['TotalServices'] = 0

        # Electronic payment risk
        if col_exists('PaymentMethod'):
            df['IsElectronicPayment'] = (df['PaymentMethod'] == 'Electronic check').astype(int)
        else:
            df['IsElectronicPayment'] = 0

        # Paperless billing flag
        if col_exists('PaperlessBilling'):
            df['PaperlessBillingFlag'] = (df['PaperlessBilling'] == 'Yes').astype(int)
        else:
            df['PaperlessBillingFlag'] = 0

        # High charge & new customer: potential early churn cause
        if col_exists('MonthlyCharges') and col_exists('tenure'):
            median_monthly = df['MonthlyCharges'].median()
            df['HighChargeNewCustomer'] = ((df['MonthlyCharges'] > median_monthly) & (df['tenure'] < 12)).astype(int)
        else:
            df['HighChargeNewCustomer'] = 0

        # Senior alone (simple demographic risk signal)
        if set(['SeniorCitizen', 'Partner', 'Dependents']).issubset(df.columns):
            df['SeniorAlone'] = ((df['SeniorCitizen'] == 1) & (df['Partner'] == 'No') & (df['Dependents'] == 'No')).astype(int)
        else:
            df['SeniorAlone'] = 0

        # Engagement score (services * log(tenure + 1))
        if col_exists('tenure'):
            df['EngagementScore'] = df['TotalServices'] * np.log1p(df['tenure'])
        else:
            df['EngagementScore'] = df['TotalServices']

        # Composite risk score (simple interpretable linear sum)
        df['CompositeRiskScore'] = (
            df['IsMonthToMonth'] * 3 +
            df['IsNewCustomer'] * 2 +
            df['IsElectronicPayment'] * 1 +
            df['SeniorAlone'] * 2 +
            df['HighChargeNewCustomer'] * 1
        )

        # Keep a curated list of feature candidates only (explainable ones)
        candidate_features = []
        for col in ['AvgMonthlyCharges', 'MonthlyCharges', 'TotalCharges', 'tenure',
                    'IsNewCustomer', 'IsLoyalCustomer', 'IsMonthToMonth', 'ContractValue',
                    'TotalServices', 'IsElectronicPayment', 'PaperlessBillingFlag',
                    'HighChargeNewCustomer', 'SeniorAlone', 'EngagementScore', 'CompositeRiskScore']:
            if col in df.columns:
                candidate_features.append(col)

        # Fill any remaining missing values with medians or zeros (simple, human)
        for col in candidate_features:
            if df[col].isnull().any():
                if df[col].dtype.kind in "biufc":
                    df[col] = df[col].fillna(df[col].median())
                else:
                    df[col] = df[col].fillna(0)

        # reduce df to candidate features + target + id (keep other columns dropped to avoid accidental leakage)
        columns_to_keep = candidate_features + ['Churn']
        if self.customer_id_col:
            columns_to_keep = [self.customer_id_col] + columns_to_keep

        self.df = df.loc[:, [c for c in columns_to_keep if c in df.columns]].copy()

        print(f"Kept {len(candidate_features)} handcrafted features for modeling.")
        self.candidate_features = candidate_features
        return self

    # ------------------------
    # 3. Encode, split
    # ------------------------
    def encode_and_split(self, test_size=0.2):
        print("\n" + "=" * 60)
        print("3) ENCODE & SPLIT")
        print("=" * 60)

        df = self.df.copy()

        # Encode any remaining object columns (should be minimal)
        object_cols = df.select_dtypes(include='object').columns.tolist()
        object_cols = [c for c in object_cols if c != 'Churn' and (self.customer_id_col is None or c != self.customer_id_col)]
        for col in object_cols:
            le = LabelEncoder()
            df[col] = le.fit_transform(df[col].astype(str))
            self.label_encoders[col] = le

        # Encode target
        le_target = LabelEncoder()
        df['Churn'] = le_target.fit_transform(df['Churn'])
        self.label_encoders['Churn'] = le_target

        # Split
        X = df.drop(columns=['Churn'] + ([self.customer_id_col] if self.customer_id_col else []))
        y = df['Churn']

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, stratify=y, random_state=self.random_state
        )

        print(f"Train shape: {self.X_train.shape} | Test shape: {self.X_test.shape}")
        return self

    # ------------------------
    # 4. Optional imbalance handling
    # ------------------------
    def handle_class_imbalance(self):
        if not getattr(self, "apply_smote", False):
            print("\n  Skipping imbalance handling (not needed or not available).")
            return self

        if not IMBL_AVAILABLE:
            print("\n imblearn not available — skipping SMOTE.")
            return self

        print("\n" + "=" * 60)
        print("4) APPLY SMOTETOMEK (TRAINING SET ONLY)")
        print("=" * 60)

        smote_tomek = SMOTETomek(random_state=self.random_state)
        X_res, y_res = smote_tomek.fit_resample(self.X_train, self.y_train)
        self.X_train, self.y_train = X_res, y_res
        print("After resampling, training class distribution:", pd.Series(self.y_train).value_counts().to_dict())
        return self

    # ------------------------
    # 5. Feature selection (SelectKBest)
    # ------------------------
    def feature_selection(self):
        print("\n" + "=" * 60)
        print("5) FEATURE SELECTION (SelectKBest)")
        print("=" * 60)

        n_features = min(self.k_best, self.X_train.shape[1])
        selector = SelectKBest(score_func=f_classif, k=n_features)
        X_train_sel = selector.fit_transform(self.X_train, self.y_train)
        X_test_sel = selector.transform(self.X_test)

        # get selected column names
        selected_idx = selector.get_support(indices=True)
        self.selected_features = self.X_train.columns[selected_idx].tolist()
        self.feature_selector = selector

        # update train/test to selected features
        self.X_train = pd.DataFrame(X_train_sel, columns=self.selected_features, index=self.X_train.index)
        self.X_test = pd.DataFrame(X_test_sel, columns=self.selected_features, index=self.X_test.index)

        print(f"Selected top {len(self.selected_features)} features.")
        print("Top features:", self.selected_features[:10])
        return self

    # ------------------------
    # 6. Scale
    # ------------------------
    def scale_features(self):
        print("\n" + "=" * 60)
        print("6) SCALE FEATURES (RobustScaler)")
        print("=" * 60)

        self.X_train_scaled = self.scaler.fit_transform(self.X_train)
        self.X_test_scaled = self.scaler.transform(self.X_test)
        return self

    # ------------------------
    # 7. Train core models + voting
    # ------------------------
    def train_and_ensemble(self):
        print("\n" + "=" * 60)
        print("7) TRAINING — 3 MODELS + SOFT VOTING")
        print("=" * 60)

        # simple, explainable models
        lr = LogisticRegression(max_iter=1000, C=1.0, random_state=self.random_state)
        rf = RandomForestClassifier(n_estimators=200, max_depth=12, n_jobs=-1, random_state=self.random_state)
        gb = GradientBoostingClassifier(n_estimators=200, learning_rate=0.05, max_depth=5, random_state=self.random_state)

        # cross-validate each model (roc_auc)
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=self.random_state)
        models = {'LogisticRegression': lr, 'RandomForest': rf, 'GradientBoosting': gb}

        trained = {}
        results = {}
        for name, model in models.items():
            print(f"\nTraining {name} ...")
            try:
                cv_scores = cross_val_score(model, self.X_train_scaled, self.y_train, cv=cv, scoring='roc_auc', n_jobs=-1)
                print(f"  CV ROC AUC mean: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")
            except Exception:
                print("  CV failed or skipped for speed.")
            model.fit(self.X_train_scaled, self.y_train)
            y_pred = model.predict(self.X_test_scaled)
            y_proba = model.predict_proba(self.X_test_scaled)[:, 1] if hasattr(model, "predict_proba") else model.decision_function(self.X_test_scaled)
            results[name] = {
                'accuracy': accuracy_score(self.y_test, y_pred),
                'precision': precision_score(self.y_test, y_pred, zero_division=0),
                'recall': recall_score(self.y_test, y_pred, zero_division=0),
                'f1': f1_score(self.y_test, y_pred, zero_division=0),
                'auc': roc_auc_score(self.y_test, y_proba)
            }
            trained[name] = model
            print(f"  Test AUC: {results[name]['auc']:.4f} | F1: {results[name]['f1']:.4f}")

        # Voting ensemble (soft)
        voting = VotingClassifier(
            estimators=[('lr', trained['LogisticRegression']), ('rf', trained['RandomForest']), ('gb', trained['GradientBoosting'])],
            voting='soft', n_jobs=-1
        )
        voting.fit(self.X_train_scaled, self.y_train)
        y_pred_vote = voting.predict(self.X_test_scaled)
        y_proba_vote = voting.predict_proba(self.X_test_scaled)[:, 1]
        results['VotingEnsemble'] = {
            'accuracy': accuracy_score(self.y_test, y_pred_vote),
            'precision': precision_score(self.y_test, y_pred_vote, zero_division=0),
            'recall': recall_score(self.y_test, y_pred_vote, zero_division=0),
            'f1': f1_score(self.y_test, y_pred_vote, zero_division=0),
            'auc': roc_auc_score(self.y_test, y_proba_vote)
        }
        print("\nVoting ensemble results:")
        print(f"  AUC: {results['VotingEnsemble']['auc']:.4f} | F1: {results['VotingEnsemble']['f1']:.4f}")

        # pick best by AUC
        best_name = max(results, key=lambda k: results[k]['auc'])
        print(f"\Best model by AUC: {best_name}")
        self.results = results
        self.best_model = voting if best_name == 'VotingEnsemble' else trained[best_name]
        self.all_models = trained
        self.model_name = best_name
        return self

    # ------------------------
    # 8. Evaluate + Insights
    # ------------------------
    def evaluate_and_report(self):
        print("\n" + "=" * 60)
        print("8) EVALUATION & BUSINESS INSIGHTS")
        print("=" * 60)

        model = self.best_model
        y_pred = model.predict(self.X_test_scaled)
        y_proba = model.predict_proba(self.X_test_scaled)[:, 1]

        # confusion matrix and derived metrics
        cm = confusion_matrix(self.y_test, y_pred)
        tn, fp, fn, tp = cm.ravel()
        specificity = tn / (tn + fp) if (tn + fp) else 0
        sensitivity = tp / (tp + fn) if (tp + fn) else 0

        print(f"\nConfusion Matrix:\n{cm}")
        print(f"Sensitivity (Recall): {sensitivity:.4f}")
        print(f"Specificity: {specificity:.4f}")

        print("\nClassification Report:")
        print(classification_report(self.y_test, y_pred, target_names=['No Churn', 'Churn']))

        # print compact metrics summary
        best_res = self.results.get(self.model_name, {})
        print("\nModel summary (selected model):")
        print(f"  Accuracy: {best_res.get('accuracy', 0):.4f}")
        print(f"  Precision: {best_res.get('precision', 0):.4f}")
        print(f"  Recall: {best_res.get('recall', 0):.4f}")
        print(f"  F1: {best_res.get('f1', 0):.4f}")
        print(f"  ROC AUC: {best_res.get('auc', 0):.4f}")

        # Simple human-readable insights (based on feature effects we engineered)
        print("\n BUSINESS INSIGHTS (human-friendly):")
        print("- Month-to-month contract customers are highest risk — target them with offers.")
        print("- New customers (<12 months) with high monthly charges churn faster.")
        print("- Customers with more services and higher engagement scores show lower churn risk.")
        print("- Electronic-check payments correlate with higher churn probability (payment friction).")

        return self

    # ------------------------
    # 9. Save artifacts + export predictions CSV
    # ------------------------
    def save_artifacts_and_export(self, out_dir="output"):
        print("\n" + "=" * 60)
        print("9) SAVE ARTIFACTS & EXPORT PREDICTIONS")
        print("=" * 60)

        os.makedirs(out_dir, exist_ok=True)

        # save model + scaler + encoders + feature selector + selected features
        with open(os.path.join(out_dir, "model.pkl"), "wb") as f:
            pickle.dump(self.best_model, f)
        with open(os.path.join(out_dir, "scaler.pkl"), "wb") as f:
            pickle.dump(self.scaler, f)
        with open(os.path.join(out_dir, "encoders.pkl"), "wb") as f:
            pickle.dump(self.label_encoders, f)
        with open(os.path.join(out_dir, "feature_selector.pkl"), "wb") as f:
            pickle.dump(self.feature_selector, f)
        with open(os.path.join(out_dir, "selected_features.txt"), "w") as f:
            for feat in self.selected_features:
                f.write(feat + "\n")

        print(f"Saved model and artifacts to {out_dir}/")

        # export test predictions with probabilities and customer IDs if present
        X_test_df = self.X_test.copy()
        preds = self.best_model.predict(self.X_test_scaled)
        probs = self.best_model.predict_proba(self.X_test_scaled)[:, 1]

        export_df = pd.DataFrame({
            'PredictedChurn': preds,
            'ChurnProbability': probs
        }, index=self.X_test.index)

        # map predicted labels back to original (if label encoder used)
        if 'Churn' in self.label_encoders:
            le = self.label_encoders['Churn']
            try:
                export_df['PredictedChurnLabel'] = le.inverse_transform(export_df['PredictedChurn'])
            except Exception:
                export_df['PredictedChurnLabel'] = export_df['PredictedChurn']

        # attach customer ID if available from original df
        full_df = pd.read_csv(self.data_path)
        if self.customer_id_col and self.customer_id_col in full_df.columns:
            # align using the original indices — safe because we never shuffled the original df's index during split
            id_series = full_df.loc[self.X_test.index, self.customer_id_col] if self.X_test.index.isin(full_df.index).all() else full_df[self.customer_id_col]
            export_df.insert(0, self.customer_id_col, id_series.values[:len(export_df)])

        export_path = os.path.join(out_dir, "churn_predictions.csv")
        export_df.to_csv(export_path, index=False)
        print(f"Exported predictions to {export_path}")

        return self


def main():
    DATA_PATH = "C:\\pro m\\ml pro\\WA_Fn-UseC_-Telco-Customer-Churn.csv"  # replace with your actual file if different

    predictor = HandcraftedChurnPredictor(DATA_PATH, k_best=20)
    (predictor
     .load_data()
     .feature_engineering()
     .encode_and_split()
     .handle_class_imbalance()
     .feature_selection()
     .scale_features()
     .train_and_ensemble()
     .evaluate_and_report()
     .save_artifacts_and_export(out_dir="output_handcrafted"))
    print("\n Pipeline finished. Check output_handcrafted/ for artifacts and churn_predictions.csv")


if __name__ == "__main__":
    main()



1) LOADING DATA & BASIC CHECKS
Loaded dataset with shape: (7043, 21)

Class distribution (normalized):
{'No': 0.7346301292063041, 'Yes': 0.2653698707936959}
Imbalance ratio: 2.77:1
⚠️  Class imbalance detected — SMOTETomek will be used on training set.

2) FEATURE ENGINEERING — focused & interpretable
Kept 15 handcrafted features for modeling.

3) ENCODE & SPLIT
Train shape: (5634, 15) | Test shape: (1409, 15)

4) APPLY SMOTETOMEK (TRAINING SET ONLY)
After resampling, training class distribution: {0: 3843, 1: 3843}

5) FEATURE SELECTION (SelectKBest)
Selected top 15 features.
Top features: ['AvgMonthlyCharges', 'MonthlyCharges', 'TotalCharges', 'tenure', 'IsNewCustomer', 'IsLoyalCustomer', 'IsMonthToMonth', 'ContractValue', 'TotalServices', 'IsElectronicPayment']

6) SCALE FEATURES (RobustScaler)

7) TRAINING — 3 MODELS + SOFT VOTING

Training LogisticRegression ...
  CV ROC AUC mean: 0.8909 (+/- 0.0035)
  Test AUC: 0.8307 | F1: 0.6137

Training RandomForest ...
  CV ROC AUC mean: 0.9