In [None]:

!pip install scikit-learn

# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    roc_auc_score, confusion_matrix, roc_curve
)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

# Load the cleaned churn dataset and preview its first rows
df = pd.read_csv("data/cleaned/telco_churn_cleaned.csv")
df.head()

# Separate the target from the predictors.
# 'customerid' is removed together with the target because it is not a
# feature for prediction.
y = df["churn"]
X = df.drop(["customerid", "churn"], axis=1)

# Hold out 20% of the rows as a test set; stratifying on y keeps the
# proportion of churn classes the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Standardize numerical features
# 'tenure', 'monthlycharges', and 'totalcharges' are the numerical features
num_cols = ["tenure", "monthlycharges", "totalcharges"]
scaler = StandardScaler()

# train_test_split hands back row subsets of X. Take explicit copies so the
# column assignments below write to independent frames instead of triggering
# pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training split only, then reuse the fitted statistics
# on the test split.
# NOTE(review): the scaler is fitted on all of X_train before the
# cross-validated grid search runs, so each validation fold is scaled with
# statistics that include it. Consider moving the scaler into a Pipeline.
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# Hyperparameter search setup for logistic regression

# Estimator whose hyperparameters are tuned
base_lr = LogisticRegression(random_state=42)

# Candidate values: only the regularization strength C varies; the penalty
# and solver are each fixed to a single option.
param_grid_lr = dict(
    C=[0.01, 0.1, 1, 10],
    penalty=["l2"],
    solver=["liblinear"],
)

# 5-fold cross-validated search scored by ROC AUC, using all available cores
grid_lr = GridSearchCV(
    base_lr,
    param_grid_lr,
    cv=5,
    scoring="roc_auc",
    verbose=1,
    n_jobs=-1,
)

# Fit every hyperparameter candidate on the training split
grid_lr.fit(X_train, y_train)

# Report the best cross-validated score and the parameters that produced it
for label, value in (
    ("Best CV AUC:        ", grid_lr.best_score_),
    ("Best hyperparameters:", grid_lr.best_params_),
):
    print(label, value)

# Evaluate the best model found by the search on the hold-out test set
best_lr = grid_lr.best_estimator_

# Hard class predictions and the predicted probability of the second class
y_pred_lr = best_lr.predict(X_test)
y_proba_lr = best_lr.predict_proba(X_test)[:, 1]

# Metrics computed from the hard predictions
for label, metric in (
    ("Test Accuracy :", accuracy_score),
    ("Test Precision:", precision_score),
    ("Test Recall   :", recall_score),
):
    print(label, metric(y_test, y_pred_lr))

# ROC AUC is computed from the predicted probabilities, not the hard labels
print("Test ROC AUC  :", roc_auc_score(y_test, y_proba_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))

# Plot the ROC curve of the tuned logistic regression on the test set
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_proba_lr)

fig, ax = plt.subplots()
ax.plot(
    fpr_lr,
    tpr_lr,
    label=f'LogReg (AUC = {roc_auc_score(y_test, y_proba_lr):.3f})',
)
# Dashed diagonal for visual reference
ax.plot([0, 1], [0, 1], '--')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("Logistic Regression ROC Curve")
ax.legend(loc="lower right")
plt.show()

# Inspect Coefficients
# Label each fitted coefficient with its feature name, then order the series
# by absolute value so the largest-magnitude coefficients come first.
coef = pd.Series(best_lr.coef_[0], index=X.columns)
coef = coef.sort_values(key=np.abs, ascending=False)

# Show top 10 most influential features (by absolute value)
coef.head(10)
