# DEVELOPERS_HUB_ASSIGNMENT2_TASK2

In [1]:
!pip install scikit-learn pandas joblib


Defaulting to user installation because normal site-packages is not writeable


In [2]:
!pip install scikit-learn pandas numpy joblib matplotlib seaborn


Defaulting to user installation because normal site-packages is not writeable


In [6]:
# churn_pipeline.py

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib

# -------------------------
# Load Dataset
# -------------------------
# Telco churn dataset (replace with your path if CSV is local)
df = pd.read_csv("C:/Users/Habeeban Memon/Desktop/Assignment2/WA_Fn-UseC_-Telco-Customer-Churn.csv")


# Drop customerID (not useful)
df = df.drop("customerID", axis=1)

# Target
y = df["Churn"].apply(lambda x: 1 if x == "Yes" else 0)
X = df.drop("Churn", axis=1)

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=["object"]).columns
numeric_cols = X.select_dtypes(include=["int64", "float64"]).columns

# -------------------------
# Preprocessing
# -------------------------
numeric_transformer = Pipeline(steps=[
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols)
    ]
)

# -------------------------
# Models
# -------------------------
log_reg = LogisticRegression(max_iter=1000)
rf_clf = RandomForestClassifier(random_state=42)

# Pipelines
log_reg_pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", log_reg)
])

rf_pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", rf_clf)
])

# -------------------------
# Hyperparameter Tuning with GridSearchCV
# -------------------------
param_grid_logreg = {
    "classifier__C": [0.01, 0.1, 1, 10],
    "classifier__solver": ["liblinear", "lbfgs"]
}

param_grid_rf = {
    "classifier__n_estimators": [50, 100],
    "classifier__max_depth": [5, 10, None]
}

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# GridSearch for Logistic Regression
grid_logreg = GridSearchCV(log_reg_pipe, param_grid_logreg, cv=3, scoring="accuracy")
grid_logreg.fit(X_train, y_train)

# GridSearch for Random Forest
grid_rf = GridSearchCV(rf_pipe, param_grid_rf, cv=3, scoring="accuracy")
grid_rf.fit(X_train, y_train)

# -------------------------
# Evaluate
# -------------------------
print("Best Logistic Regression Params:", grid_logreg.best_params_)
print("Best Logistic Regression Accuracy:", grid_logreg.best_score_)

print("Best Random Forest Params:", grid_rf.best_params_)
print("Best Random Forest Accuracy:", grid_rf.best_score_)

# Choose best model
best_model = grid_rf if grid_rf.best_score_ > grid_logreg.best_score_ else grid_logreg

# Final evaluation
y_pred = best_model.predict(X_test)
print("\nFinal Model Report:")
print(classification_report(y_test, y_pred))
print("Test Accuracy:", accuracy_score(y_test, y_pred))

# -------------------------
# Save pipeline
# -------------------------
joblib.dump(best_model.best_estimator_, "churn_pipeline.pkl")
print("✅ Pipeline exported as churn_pipeline.pkl")


Best Logistic Regression Params: {'classifier__C': 0.01, 'classifier__solver': 'liblinear'}
Best Logistic Regression Accuracy: 0.8004969826056088
Best Random Forest Params: {'classifier__max_depth': None, 'classifier__n_estimators': 100}
Best Random Forest Accuracy: 0.7873624423145191

Final Model Report:
              precision    recall  f1-score   support

           0       0.85      0.92      0.88      1036
           1       0.71      0.57      0.63       373

    accuracy                           0.82      1409
   macro avg       0.78      0.74      0.76      1409
weighted avg       0.81      0.82      0.82      1409

Test Accuracy: 0.8225691980127751
✅ Pipeline exported as churn_pipeline.pkl
