In [1]:
# Install required packages if needed
# pip install scikit-learn pandas numpy joblib

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib
import warnings
# NOTE(review): this silences every warning category for the rest of the
# notebook, so any convergence, deprecation or fit-failure warnings raised by
# later cells will not be shown. Consider narrowing the filter to specific
# categories once the notebook is stable.
warnings.filterwarnings('ignore')

In [2]:
# Read the Telco customer-churn CSV from the working directory (swap in your own path).
# Source: https://www.kaggle.com/blastchar/telco-customer-churn
df = pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

# Quick structural overview: size, column names, per-column null counts and dtypes
print("Dataset shape:", df.shape)
print("\nColumns:", list(df.columns))
print("\nMissing values:\n", df.isna().sum())
print("\nData types:\n", df.dtypes)

Dataset shape: (7043, 21)

Columns: ['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']

Missing values:
 customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

Data types:
 customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           

In [3]:
# Clean the data
def preprocess_data(df):
    """Return a cleaned copy of the raw churn frame, ready for modelling.

    Steps, in order:
      1. coerce ``TotalCharges`` to numeric (unparseable entries become NaN),
      2. drop the ``customerID`` column,
      3. fill missing ``TotalCharges`` values with the column median,
      4. map ``Churn`` from ``'Yes'``/``'No'`` to ``1``/``0`` (any other
         label becomes NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing at least the ``customerID``, ``TotalCharges``
        and ``Churn`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left unmodified.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    # Work on a copy so the caller's frame is never mutated
    df_clean = df.copy()

    # Convert TotalCharges to numeric; blank / malformed strings become NaN
    df_clean['TotalCharges'] = pd.to_numeric(df_clean['TotalCharges'], errors='coerce')

    # customerID is an identifier, not a predictive feature
    df_clean = df_clean.drop(columns='customerID')

    # Assign the filled column back explicitly. Calling fillna(inplace=True)
    # on a column selection is a chained in-place update, which pandas
    # Copy-on-Write no longer propagates to the parent frame.
    total_charges_median = df_clean['TotalCharges'].median()
    df_clean['TotalCharges'] = df_clean['TotalCharges'].fillna(total_charges_median)

    # Convert target variable to binary
    df_clean['Churn'] = df_clean['Churn'].map({'Yes': 1, 'No': 0})

    return df_clean

# Run the cleaning step on the raw frame
df_clean = preprocess_data(df)

# Target is the Churn column; the feature matrix is every other column
y = df_clean['Churn']
X = df_clean.drop(columns='Churn')

# Hold out 20% of the rows for testing, stratified on the target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")
print(f"Churn distribution in training: {y_train.value_counts(normalize=True)}")

Training set size: (5634, 19)
Test set size: (1409, 19)
Churn distribution in training: Churn
0    0.734647
1    0.265353
Name: proportion, dtype: float64


In [4]:
# Continuous features are listed by hand; every other column of X is treated as categorical
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
categorical_cols = list(filter(lambda name: name not in numerical_cols, X.columns))

print("Numerical columns:", numerical_cols)
print("Categorical columns:", categorical_cols)

Numerical columns: ['tenure', 'MonthlyCharges', 'TotalCharges']
Categorical columns: ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod']


In [5]:
# Numeric branch: median-impute any missing values, then standardise
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then dense one-hot
# encode; categories not seen during fit are ignored instead of raising
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# Route each column group through its own branch
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

In [6]:
# Two candidate models; both start with the same preprocessing step and
# differ only in the final estimator.

# Candidate 1: logistic regression
lr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42)),
])

# Candidate 2: random forest
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])

In [7]:
# Hyper-parameter grids for GridSearch. The 'classifier__' prefix routes each
# entry to the pipeline step named 'classifier'.
lr_param_grid = {
    'classifier__C': [0.1, 1, 10, 100],
    # Request an unpenalised fit with the Python None object. The string
    # 'none' was deprecated in scikit-learn 1.2 and is rejected from 1.4 on,
    # which would make every such candidate fail to fit.
    # NOTE: C has no effect when penalty is None, so those candidates differ
    # only by solver.
    'classifier__penalty': ['l2', None],
    'classifier__solver': ['lbfgs', 'sag']
}

rf_param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5],
    'classifier__min_samples_leaf': [1, 2]
}

def _tune(banner, pipeline, param_grid):
    """Print ``banner``, then run a 5-fold accuracy-scored grid search on the training split.

    Returns the fitted GridSearchCV object.
    """
    print(banner)
    search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)
    return search


# Tune each candidate in turn (logistic regression first, then random forest)
lr_grid_search = _tune("Tuning Logistic Regression...", lr_pipeline, lr_param_grid)
rf_grid_search = _tune("Tuning Random Forest...", rf_pipeline, rf_param_grid)

# Keep the best estimator found by each search
best_lr = lr_grid_search.best_estimator_
best_rf = rf_grid_search.best_estimator_

print(f"Best Logistic Regression params: {lr_grid_search.best_params_}")
print(f"Best Random Forest params: {rf_grid_search.best_params_}")

Tuning Logistic Regression...
Fitting 5 folds for each of 16 candidates, totalling 80 fits
Tuning Random Forest...
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best Logistic Regression params: {'classifier__C': 10, 'classifier__penalty': 'l2', 'classifier__solver': 'lbfgs'}
Best Random Forest params: {'classifier__max_depth': 10, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}


In [8]:
def evaluate_model(model, X_test, y_test, model_name):
    """Print a test-set report for a fitted classifier and return its accuracy.

    The report contains the accuracy, the classification report and the
    confusion matrix, all computed from ``model.predict(X_test)``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_test : features to predict on
    y_test : true labels for ``X_test``
    model_name : str
        Label used in the printed header.

    Returns
    -------
    float
        Accuracy of the predictions against ``y_test``.
    """
    y_pred = model.predict(X_test)
    # Only hard predictions are needed here, so predict_proba is not called
    # and models without it can be evaluated too. Accuracy is computed once
    # and reused for both the printout and the return value.
    accuracy = accuracy_score(y_test, y_pred)

    print(f"\n{model_name} Performance:")
    print("=" * 50)
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    return accuracy

# Report held-out performance for both tuned models
lr_accuracy = evaluate_model(best_lr, X_test, y_test, "Logistic Regression")
rf_accuracy = evaluate_model(best_rf, X_test, y_test, "Random Forest")

# Select the model to export by the cross-validated score from the grid
# searches rather than by test accuracy: choosing the winner on the test set
# would make the test numbers above an optimistic estimate for the selected
# model. Ties go to Random Forest, as in the original comparison.
if lr_grid_search.best_score_ > rf_grid_search.best_score_:
    best_model = best_lr
    print("\nSelected Logistic Regression as the best model")
else:
    best_model = best_rf
    print("\nSelected Random Forest as the best model")


Logistic Regression Performance:
Accuracy: 0.8055

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1035
           1       0.66      0.56      0.60       374

    accuracy                           0.81      1409
   macro avg       0.75      0.73      0.74      1409
weighted avg       0.80      0.81      0.80      1409


Confusion Matrix:
[[926 109]
 [165 209]]

Random Forest Performance:
Accuracy: 0.8062

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.91      0.87      1035
           1       0.68      0.52      0.59       374

    accuracy                           0.81      1409
   macro avg       0.76      0.71      0.73      1409
weighted avg       0.80      0.81      0.80      1409


Confusion Matrix:
[[943  92]
 [181 193]]

Selected Random Forest as the best model


In [9]:
# Export the selected, fitted pipeline (preprocessing + classifier)
joblib.dump(best_model, 'customer_churn_pipeline.joblib')
print("Pipeline exported successfully as 'customer_churn_pipeline.joblib'")

# Also export the preprocessing step separately for potential reuse. It is
# taken from the fitted pipeline: GridSearchCV fits clones of the estimator it
# is given, so the module-level `preprocessor` object is never fitted and
# could not transform data after being reloaded.
joblib.dump(best_model.named_steps['preprocessor'], 'churn_preprocessor.joblib')
print("Preprocessor exported successfully as 'churn_preprocessor.joblib'")

Pipeline exported successfully as 'customer_churn_pipeline.joblib'
Preprocessor exported successfully as 'churn_preprocessor.joblib'


In [11]:
# Example of how to use the exported pipeline in production
def predict_churn(new_customer_data, pipeline_path='customer_churn_pipeline.joblib',
                  confidence_threshold=0.7):
    """Predict churn for one customer using the exported pipeline.

    Parameters
    ----------
    new_customer_data : dict or pandas.DataFrame
        A single customer as a ``{column: value}`` dict, or a DataFrame.
        Only the first row of a DataFrame is reported in the result.
    pipeline_path : str
        Location of the joblib file holding the fitted pipeline.
    confidence_threshold : float, default 0.7
        The result is labelled ``'High'`` confidence when the largest class
        probability is strictly above this value, ``'Medium'`` otherwise.

    Returns
    -------
    dict
        ``churn_prediction`` (``'Yes'`` when the predicted label equals 1,
        else ``'No'``), ``churn_probability`` (second column of
        ``predict_proba`` as a float) and ``confidence``.
    """
    # SECURITY: joblib.load unpickles the file, which can execute arbitrary
    # code. Only load pipeline files that come from a trusted source.
    pipeline = joblib.load(pipeline_path)

    # Ensure data is in DataFrame format so columns can be selected by name
    features = new_customer_data
    if isinstance(features, dict):
        features = pd.DataFrame([features])

    prediction = pipeline.predict(features)
    prediction_proba = pipeline.predict_proba(features)

    first_row_proba = prediction_proba[0]
    return {
        'churn_prediction': 'Yes' if prediction[0] == 1 else 'No',
        'churn_probability': float(first_row_proba[1]),
        'confidence': 'High' if max(first_row_proba) > confidence_threshold else 'Medium',
    }

# Example usage: one customer described by the 19 feature columns the model
# was trained on (no customerID, no Churn label)
sample_customer = {
    'gender': 'Female',
    'SeniorCitizen': 0,
    'Partner': 'Yes',
    'Dependents': 'No',
    'tenure': 12,
    'PhoneService': 'Yes',
    'MultipleLines': 'No',
    'InternetService': 'DSL',
    'OnlineSecurity': 'No',
    'OnlineBackup': 'Yes',
    'DeviceProtection': 'No',
    'TechSupport': 'No',
    'StreamingTV': 'No',
    'StreamingMovies': 'No',
    'Contract': 'Month-to-month',
    'PaperlessBilling': 'Yes',
    'PaymentMethod': 'Electronic check',
    'MonthlyCharges': 29.85,
    'TotalCharges': 358.2
}

# Make prediction (loads the exported pipeline from its default path)
prediction_result = predict_churn(sample_customer)
print("\nPrediction for sample customer:")
# Print each field of the result dict on its own line
for key, value in prediction_result.items():
    print(f"{key}: {value}")


Prediction for sample customer:
churn_prediction: No
churn_probability: 0.26341281180570475
confidence: High


In [12]:
# Create a comprehensive pipeline class for better reusability
class ChurnPredictionPipeline:
    """Thin wrapper around the exported churn pipeline.

    Loads the joblib artifact once and exposes prediction helpers that accept
    either a DataFrame or a single customer given as a ``{column: value}`` dict.

    Attributes
    ----------
    pipeline : the object loaded from ``pipeline_path``
    feature_names : list of str or None
        Names of the features produced by the steps before the final
        estimator, in the order matching ``get_feature_importance()``;
        ``None`` when the loaded object cannot report them.
    """

    def __init__(self, pipeline_path='customer_churn_pipeline.joblib'):
        """Load the fitted pipeline stored at ``pipeline_path``."""
        # SECURITY: joblib.load unpickles the file, which can execute
        # arbitrary code. Only load pipeline files from a trusted source.
        self.pipeline = joblib.load(pipeline_path)
        self.feature_names = self._resolve_feature_names(self.pipeline)

    @staticmethod
    def _resolve_feature_names(pipeline):
        """Best-effort lookup of the transformed feature names; None if unavailable."""
        try:
            # Everything except the last step is preprocessing; its output
            # names are what the final estimator was trained on.
            return list(pipeline[:-1].get_feature_names_out())
        except (AttributeError, IndexError, KeyError, TypeError, ValueError):
            return None

    @staticmethod
    def _as_frame(data):
        """Wrap a single-customer dict into a one-row DataFrame; pass anything else through."""
        if isinstance(data, dict):
            return pd.DataFrame([data])
        return data

    def predict(self, data):
        """Make predictions"""
        return self.pipeline.predict(self._as_frame(data))

    def predict_proba(self, data):
        """Get prediction probabilities"""
        return self.pipeline.predict_proba(self._as_frame(data))

    def get_feature_importance(self):
        """Get feature importance if available.

        Returns ``feature_importances_`` for estimators that expose it (tree
        based models), the first row of ``coef_`` for estimators that expose
        that instead (linear models), and ``None`` otherwise.
        """
        classifier = self.pipeline.named_steps['classifier']
        if hasattr(classifier, 'feature_importances_'):
            return classifier.feature_importances_
        if hasattr(classifier, 'coef_'):
            return classifier.coef_[0]
        return None

# Usage example: load the exported pipeline from its default path and
# classify the sample customer defined in the previous cell
churn_predictor = ChurnPredictionPipeline()
result = churn_predictor.predict(sample_customer)
# result holds one predicted label per input row; the sample is a single row
print(f"Prediction: {result[0]}")

Prediction: 0
