Cell 1: Imports + Load data

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    average_precision_score
)

import joblib


In [2]:
# ===============================
# Paths (Notebook-safe)
# ===============================
PROJECT_ROOT = Path.cwd().parent  # notebooks -> project_3_churn_prediction
DATA_PATH = PROJECT_ROOT / "data" / "processed" / "churn_processed.csv"

OUTPUT_DIR = PROJECT_ROOT / "outputs"
MODEL_DIR = OUTPUT_DIR / "models"
MODEL_DIR.mkdir(parents=True, exist_ok=True)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("DATA_PATH:", DATA_PATH)
print("Exists:", DATA_PATH.exists())
print("MODEL_DIR:", MODEL_DIR)


PROJECT_ROOT: c:\Users\Lenovo\Desktop\Portfolio\data-portfolio-nichagan\project_3_churn_prediction
DATA_PATH: c:\Users\Lenovo\Desktop\Portfolio\data-portfolio-nichagan\project_3_churn_prediction\data\processed\churn_processed.csv
Exists: True
MODEL_DIR: c:\Users\Lenovo\Desktop\Portfolio\data-portfolio-nichagan\project_3_churn_prediction\outputs\models


Cell 2: Target cleaning (Yes/No -> 0/1) + check

In [9]:
import pandas as pd
from pathlib import Path

# --- paths (robust for notebooks) ---
PROJECT_ROOT = Path.cwd().parent   # now at .../project_3_churn_prediction
RAW_PATH = PROJECT_ROOT / "data" / "raw" / "telco-customer-churn.csv"

df = pd.read_csv(RAW_PATH)

# --- target (short) ---
target_col = "Churn" if "Churn" in df.columns else df.columns[-1]
df[target_col] = df[target_col].astype(str).str.strip().str.lower().map({"yes": 1, "no": 0}).astype(int)

print("CWD:", Path.cwd())
print("RAW_PATH:", RAW_PATH, "| Exists:", RAW_PATH.exists())
print("target_col =", target_col)
print("\nValue counts:\n", df[target_col].value_counts())
print("\nNormalized value counts:\n", df[target_col].value_counts(normalize=True))


CWD: c:\Users\Lenovo\Desktop\Portfolio\data-portfolio-nichagan\project_3_churn_prediction\notebooks
RAW_PATH: c:\Users\Lenovo\Desktop\Portfolio\data-portfolio-nichagan\project_3_churn_prediction\data\raw\telco-customer-churn.csv | Exists: True
target_col = Churn

Value counts:
 Churn
0    5174
1    1869
Name: count, dtype: int64

Normalized value counts:
 Churn
0    0.73463
1    0.26537
Name: proportion, dtype: float64


Cell 3: Split X/y + Train/test split

In [10]:
# ===============================
# Split features/target
# ===============================
y = df[target_col]
X = df.drop(columns=[target_col])

# (optional) drop ID column if exists
for id_col in ["customerID", "CustomerID", "customer_id", "Row ID", "RowID"]:
    if id_col in X.columns:
        X = X.drop(columns=[id_col])
        print("Dropped ID column:", id_col)

# ===============================
# Train/test split (stratify สำคัญมาก)
# ===============================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

X_train.shape, X_test.shape


Dropped ID column: customerID


((5634, 19), (1409, 19))

Cell 4: Build preprocess (ColumnTransformer)

In [11]:
# ===============================
# Column types
# ===============================
num_cols = X_train.select_dtypes(include=["number"]).columns.tolist()
cat_cols = [c for c in X_train.columns if c not in num_cols]

print("Numeric cols:", len(num_cols))
print("Categorical cols:", len(cat_cols))
print("Example cat cols:", cat_cols[:10])


Numeric cols: 3
Categorical cols: 16
Example cat cols: ['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport']


In [12]:
# ===============================
# Preprocess pipelines
# ===============================
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ],
    remainder="drop"
)


Cell 5: Train baseline models (LogReg + RF)

In [13]:
# ===============================
# Model 1: Logistic Regression (baseline)
# ===============================
logreg_model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", LogisticRegression(max_iter=2000))
])

logreg_model.fit(X_train, y_train)

# ===============================
# Model 2: Random Forest (strong baseline)
# ===============================
rf_model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", RandomForestClassifier(
        n_estimators=400,
        random_state=42,
        class_weight="balanced"
    ))
])

rf_model.fit(X_train, y_train)

print("✅ Training done.")


✅ Training done.


Cell 6: Evaluate function + Evaluate both models

In [14]:
def evaluate(model, X_test, y_test, name="model"):
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    roc = roc_auc_score(y_test, y_proba)
    pr = average_precision_score(y_test, y_proba)
    cm = confusion_matrix(y_test, y_pred)

    print(f"\n====== {name} ======")
    print("ROC-AUC:", roc)
    print("PR-AUC:", pr)
    print("Confusion Matrix:\n", cm)
    print("Classification Report:\n", classification_report(y_test, y_pred))

    return {"name": name, "roc_auc": roc, "pr_auc": pr}

res_lr = evaluate(logreg_model, X_test, y_test, "LogisticRegression")
res_rf = evaluate(rf_model, X_test, y_test, "RandomForest")

res_lr, res_rf



ROC-AUC: 0.8402981218837996
PR-AUC: 0.6351133262986087
Confusion Matrix:
 [[917 118]
 [172 202]]
Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.89      0.86      1035
           1       0.63      0.54      0.58       374

    accuracy                           0.79      1409
   macro avg       0.74      0.71      0.72      1409
weighted avg       0.79      0.79      0.79      1409


ROC-AUC: 0.8214317083882301
PR-AUC: 0.6046770159870083
Confusion Matrix:
 [[916 119]
 [188 186]]
Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.89      0.86      1035
           1       0.61      0.50      0.55       374

    accuracy                           0.78      1409
   macro avg       0.72      0.69      0.70      1409
weighted avg       0.77      0.78      0.77      1409



({'name': 'LogisticRegression',
  'roc_auc': 0.8402981218837996,
  'pr_auc': 0.6351133262986087},
 {'name': 'RandomForest',
  'roc_auc': 0.8214317083882301,
  'pr_auc': 0.6046770159870083})

Cell 7: Save best model เป็น best_model.pkl

In [15]:
# ===============================
# Choose best by ROC-AUC
# ===============================
best_model = rf_model if res_rf["roc_auc"] >= res_lr["roc_auc"] else logreg_model
best_name = "RandomForest" if best_model is rf_model else "LogisticRegression"

best_model_path = MODEL_DIR / "best_model.pkl"
joblib.dump(best_model, best_model_path)

print("✅ Saved best model:", best_name)
print("Path:", best_model_path)
print("Exists:", best_model_path.exists())


✅ Saved best model: LogisticRegression
Path: c:\Users\Lenovo\Desktop\Portfolio\data-portfolio-nichagan\project_3_churn_prediction\outputs\models\best_model.pkl
Exists: True
