In [1]:
# Cell 1: Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import joblib
import warnings
warnings.filterwarnings("ignore")


In [2]:
# Cell 2: Load dataset
# The CSV is located relative to the notebook's working directory rather than
# through a user-specific absolute path, so the notebook can run on another
# machine or checkout.
# NOTE(review): assumes the notebook is executed from a directory that sits
# next to `data/` (the same layout the "../models" path used later in this
# notebook relies on) — adjust the path if your layout differs.
data_path = "../data/Telco-Customer-Churn.csv"
df = pd.read_csv(data_path, encoding='utf-8')
df.head()


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
# Cell 3: Preprocess data
#
# Produces: df (cleaned frame), X (one-hot encoded features), y (0/1 target),
# scaler (fitted on the training rows only), X_train / X_test (scaled arrays),
# y_train / y_test, and X_scaled (all rows, scaled with the training statistics).

# Clean the frame without in-place mutation so this cell can be re-run safely:
#   * errors="ignore" lets the drop succeed when customerID is already gone;
#   * to_numeric(errors="coerce") turns non-numeric TotalCharges into NaN,
#     and those rows are then removed by dropna();
#   * isin(["Yes", 1]) maps "Yes" -> 1 and everything else -> 0, while leaving
#     an already-encoded 0/1 column unchanged on a second run.
df = (
    df.drop(columns="customerID", errors="ignore")
    .assign(TotalCharges=lambda d: pd.to_numeric(d["TotalCharges"], errors="coerce"))
    .dropna()
    .assign(Churn=lambda d: d["Churn"].isin(["Yes", 1]).astype("int64"))
)

# Features and target
X = df.drop(columns="Churn")
y = df["Churn"]

# One-hot encode categorical variables (first level of each is dropped)
X = pd.get_dummies(X, drop_first=True)

# Train/test split happens BEFORE scaling so the held-out rows never
# influence the scaler's mean/variance.
X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only, then apply it to both partitions.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_unscaled)
X_test = scaler.transform(X_test_unscaled)

# Kept so any later cell that references X_scaled still works. It contains the
# test rows as well, so it must not be used for model evaluation.
X_scaled = scaler.transform(X)


In [4]:
# Cell 4: Train multiple models and evaluate
#
# Fits each candidate classifier on the training split and reports accuracy
# plus a per-class precision/recall/F1 report on the test split.
# `models` keeps the fitted estimators; `results` maps model name -> accuracy.

# Every estimator that accepts a seed gets random_state=42 (the same value
# used for the train/test split) so the reported scores are reproducible
# after a kernel restart. LogisticRegression and KNN are left at their
# defaults, as in the original cell.
# NOTE(review): `use_label_encoder` is kept from the original call; newer
# xgboost releases may ignore or reject it — confirm against the installed version.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "KNN": KNeighborsClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "LightGBM": LGBMClassifier(random_state=42)
}

results = {}

for name, model in models.items():
    # fit() trains the estimator in place, so `models[name]` is the fitted model afterwards.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    results[name] = acc
    print(f"----- {name} -----")
    print("Accuracy:", acc)
    print(classification_report(y_test, y_pred))
    print()


----- Logistic Regression -----
Accuracy: 0.8038379530916845
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1033
           1       0.65      0.57      0.61       374

    accuracy                           0.80      1407
   macro avg       0.75      0.73      0.74      1407
weighted avg       0.80      0.80      0.80      1407


----- Random Forest -----
Accuracy: 0.7903340440653873
              precision    recall  f1-score   support

           0       0.83      0.89      0.86      1033
           1       0.63      0.51      0.56       374

    accuracy                           0.79      1407
   macro avg       0.73      0.70      0.71      1407
weighted avg       0.78      0.79      0.78      1407


----- Decision Tree -----
Accuracy: 0.7085998578535891
              precision    recall  f1-score   support

           0       0.80      0.80      0.80      1033
           1       0.45      0.46      0.45       374

    accur

In [6]:
# Persist the selected model and the fitted scaler with joblib.
# `best_model`, `best_model_name` and `scaler` must already exist in the kernel.
import os

# Models directory, relative to the notebook's working directory. This is the
# same "../models" location the feature-name list is written to, so all
# artifacts end up together and no user-specific absolute path is needed.
# NOTE(review): assumes the notebook runs from a directory next to `models/` —
# adjust if your layout differs.
models_dir = os.path.join("..", "models")

# Make sure the directory exists
os.makedirs(models_dir, exist_ok=True)

# Save the best model and scaler
joblib.dump(best_model, os.path.join(models_dir, 'best_model.pkl'))
joblib.dump(scaler, os.path.join(models_dir, 'scaler.pkl'))

# Report the resolved absolute location so the message stays unambiguous.
print(f"Best model: {best_model_name} saved at {os.path.abspath(models_dir)}")


Best model: Logistic Regression saved at C:\Users\Muhammad Ahmed\churn-prediction\models


In [7]:
# Save feature names used in training
# The column order of the one-hot encoded feature frame `X` is stored so the
# same columns can be rebuilt when the saved model is used later.
import os

feature_names = X.columns.tolist()

# Written into `models_dir` (defined in the cell that saves the model and
# scaler) so all three artifacts live in one directory; run that cell first.
joblib.dump(feature_names, os.path.join(models_dir, "feature_names.pkl"))


['../models/feature_names.pkl']