In [4]:
# telco_churn_pipeline.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from imblearn.over_sampling import SMOTE

import pickle

In [5]:
# 1. Load and clean dataset
df = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")

In [6]:
# Drop customerID since it is not useful for modeling
df.drop(columns=["customerID"], inplace=True)

# Replace blank spaces in TotalCharges with NaN and convert to float
df["TotalCharges"] = df["TotalCharges"].replace(" ", np.nan).astype(float)

# Impute missing TotalCharges with the median value
df["TotalCharges"] = df["TotalCharges"].fillna(df["TotalCharges"].median())

# Convert SeniorCitizen to string for consistent encoding
df["SeniorCitizen"] = df["SeniorCitizen"].astype(str)

# 2. Target encoding
df["Churn"] = df["Churn"].map({"Yes":1, "No":0})

In [7]:
# 3. Label encoding for categorical variables
categorical_cols = df.select_dtypes(include='object').columns.tolist()

encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

In [8]:
# Save encoders for inference
with open("encoders.pkl", "wb") as f_enc:
    pickle.dump(encoders, f_enc)

In [9]:
# 4. Prepare data for modeling
X = df.drop(columns=["Churn"])
y = df["Churn"]

# Stratified split to maintain class balance
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

In [10]:
# 5. Use SMOTE to balance classes in training data only
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [None]:
# 6. Define models to compare
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "XGBoost": XGBClassifier(random_state=42, eval_metric='logloss')  # removed use_label_encoder
}

print("Cross-validation accuracy scores:")
for name, model in models.items():
    scores = cross_val_score(model, X_train_smote, y_train_smote, cv=5, scoring="accuracy")
    print(f"{name}: {scores.mean():.4f} ± {scores.std():.4f}")

Cross-validation accuracy scores:
Decision Tree: 0.7888 ± 0.0607


In [None]:
# 7. Train final selected model (Random Forest here)
final_model = RandomForestClassifier(random_state=42)
final_model.fit(X_train_smote, y_train_smote)

In [None]:
# 8. Evaluate on the test set
y_pred = final_model.predict(X_test)
print("\nTest set performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))
print("Classification report:")
print(classification_report(y_test, y_pred))

In [None]:
# 9. Save the model and feature names for inference
model_package = {
    "model": final_model,
    "feature_names": X.columns.tolist()
}
with open("customer_churn_model.pkl", "wb") as f_mod:
    pickle.dump(model_package, f_mod)

In [None]:
# 10. Function to prepare inference input (with encoding and fallback for unseen categories)
def prepare_input(input_dict, encoders, feature_order):
    df_input = pd.DataFrame([input_dict])

    # Add missing columns if any (use 0 as default for numeric)
    for col in feature_order:
        if col not in df_input.columns:
            df_input[col] = 0

    # Reorder columns to model's feature order
    df_input = df_input[feature_order].copy()

    # Encode categorical variables with fallback for unseen categories
    for col, encoder in encoders.items():
        if col in df_input.columns:
            mask_unseen = ~df_input[col].isin(encoder.classes_)
            if mask_unseen.any():
                print(f"Warning: Unseen categories in column '{col}': {df_input.loc[mask_unseen, col].values}. Encoding fallback applied.")
                mode_class = encoder.classes_[0]
                df_input.loc[mask_unseen, col] = mode_class
            df_input[col] = encoder.transform(df_input[col])

    return df_input

# Example inference usage:
if __name__ == "__main__":
    # Load model and encoders
    with open("customer_churn_model.pkl", "rb") as f:
        model_data = pickle.load(f)
    with open("encoders.pkl", "rb") as f:
        encoders = pickle.load(f)

    loaded_model = model_data["model"]
    feature_names = model_data["feature_names"]

    example_input = {
        'gender': 'Female',
        'SeniorCitizen': '0',
        'Partner': 'Yes',
        'Dependents': 'No',
        'tenure': 1,
        'PhoneService': 'No',
        'MultipleLines': 'No phone service',
        'InternetService': 'DSL',
        'OnlineSecurity': 'No',
        'OnlineBackup': 'Yes',
        'DeviceProtection': 'No',
        'TechSupport': 'No',
        'StreamingTV': 'No',
        'StreamingMovies': 'No',
        'Contract': 'Month-to-month',
        'PaperlessBilling': 'Yes',
        'PaymentMethod': 'Electronic check',
        'MonthlyCharges': 29.85,
        'TotalCharges': 29.85
    }

    input_df = prepare_input(example_input, encoders, feature_names)

    prediction = loaded_model.predict(input_df)
    prediction_proba = loaded_model.predict_proba(input_df)

    print(f"Prediction: {'Churn' if prediction[0] == 1 else 'No Churn'}")
    print(f"Probabilities: {prediction_proba}")