In [4]:
import pandas as pd

# Load the full dataset
df = pd.read_csv("telco_churn.csv")

# Clean TotalCharges column (some values may be blank)
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")
df.dropna(inplace=True)

# Preview
print(df.shape)
df.head()


(7032, 21)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

# Split features/labels
X = df.drop("Churn", axis=1)
y = df["Churn"]

# Identify column types
numeric_features = ["tenure", "MonthlyCharges", "TotalCharges"]
categorical_features = [col for col in X.columns if col not in numeric_features]

# Build preprocessors
preprocessor = ColumnTransformer([
    ("num", StandardScaler(), numeric_features),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
])

# Build pipeline
pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])

# Set hyperparameters for GridSearchCV
params = {
    "classifier__n_estimators": [100, 200],
    "classifier__max_depth": [None, 10]
}

# Train-test split with stratify
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Grid Search
grid = GridSearchCV(pipeline, params, cv=3, n_jobs=-1)
grid.fit(X_train, y_train)

# Report
print("✅ Best Parameters:", grid.best_params_)
print("📊 Classification Report:\n", classification_report(y_test, grid.predict(X_test)))


✅ Best Parameters: {'classifier__max_depth': None, 'classifier__n_estimators': 100}
📊 Classification Report:
               precision    recall  f1-score   support

          No       0.83      0.90      0.86      1033
         Yes       0.64      0.48      0.55       374

    accuracy                           0.79      1407
   macro avg       0.73      0.69      0.71      1407
weighted avg       0.78      0.79      0.78      1407



In [6]:
# Export the trained pipeline
joblib.dump(grid.best_estimator_, "telco_churn_pipeline.pkl")
print("💾 Model saved as telco_churn_pipeline.pkl")


💾 Model saved as telco_churn_pipeline.pkl
