In [1]:
# 02_Modeling.ipynb
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import joblib
import mlflow

In [2]:
# 📥 Load Raw Dataset
df = pd.read_csv("../datasets/Telco-Customer-Churn.csv")

# 🧹 Data Cleaning
df.replace(" ", np.nan, inplace=True)
df.dropna(inplace=True)

# 🔄 Convert TotalCharges to numeric
df['TotalCharges'] = df['TotalCharges'].astype(float)

# 🎯 Encode target
df['Churn'] = df['Churn'].map({'Yes': 1, 'No': 0})

# 🛠️ Drop customerID (not predictive)
df.drop('customerID', axis=1, inplace=True)

# 🔁 Encode categorical variables
cat_cols = df.select_dtypes(include='object').columns
df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# 💾 Save cleaned data
df_encoded.to_csv("../datasets/cleaned_telco_data.csv", index=False)

print("✅ Cleaned data saved to ../data/cleaned_telco_data.csv")

✅ Cleaned data saved to ../data/cleaned_telco_data.csv


In [3]:
# 📦 Load Data
try:
    df = pd.read_csv("../datasets/cleaned_telco_data.csv")
    print(f"Shape after loading: {df.shape}")
except FileNotFoundError:
    print("Error: The CSV file was not found. Please check the path: ../datasets/cleaned_telco_data.csv")
    exit() # Exit if the file isn't found, as there's no data to process

if df.empty:
    print("Error: The DataFrame is empty immediately after reading the CSV. The CSV might be empty or corrupted.")
    exit()

# 🎯 Encode Target and handle NaNs
# Inspect unique values to catch unexpected entries
print("Unique values in 'Churn' before mapping:")
print(df['Churn'].unique()) # Uncomment to inspect


# Separate features (X) and target (y)
X = df.drop(columns=['Churn'])
y = df['Churn']

print(f"\nFinal X shape before split: {X.shape}")
print(f"Final y shape before split: {y.shape}")

Shape after loading: (7032, 31)
Unique values in 'Churn' before mapping:
[0 1]

Final X shape before split: (7032, 30)
Final y shape before split: (7032,)


In [4]:
# 🔄 Train/Test Split
# Crucial check: Is X empty? Is y empty?
if X.empty or y.empty:
    print("Error: X or y is empty before train_test_split. This means all your data was lost during previous processing steps.")
    print("Please review the 'Shape after...' print statements above to identify where the data was lost.")
else:
    # ✨ Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Verify the shapes of your new datasets
    print("\n---")
    print("Data Split Successful!")
    print(f"X_train shape: {X_train.shape}")
    print(f"X_test shape: {X_test.shape}")
    print(f"y_train shape: {y_train.shape}")
    print(f"y_test shape: {y_test.shape}")
    print("---")

    # 🧪 Scale Features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    joblib.dump(scaler, "../models/scaler.joblib")


---
Data Split Successful!
X_train shape: (5625, 30)
X_test shape: (1407, 30)
y_train shape: (5625,)
y_test shape: (1407,)
---


In [10]:
# 🚀 Initialize MLflow
mlflow.set_experiment("Telco Churn Prediction 1")
mlflow.set_tracking_uri("http://localhost")  # Adjust if using a different MLflow server

best_score = 0
best_model = None
best_model_name = ""

# 📌 Model Candidates
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000),
    "RandomForest": RandomForestClassifier(n_estimators=100),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

for name, model in models.items():
    with mlflow.start_run(run_name=name):
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        y_proba = model.predict_proba(X_test_scaled)[:, 1]

        acc = model.score(X_test_scaled, y_test)
        roc = roc_auc_score(y_test, y_proba)

        mlflow.log_artifacts("../datasets/cleaned_telco_data.csv", artifact_path="data")
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("roc_auc", roc)
        mlflow.log_params(model.get_params())
        mlflow.sklearn.log_model(model, "model", input_example=X_train.head(1))

        print(f"{name} Classification Report:\n", classification_report(y_test, y_pred))

        if roc > best_score:
            best_score = roc
            best_model = model
            best_model_name = name
            best_run_id = mlflow.active_run().info.run_id

print(f"\n✅ Best Model: {best_model_name} with ROC-AUC: {best_score:.4f}")
joblib.dump(best_model, "../scripts/best_churn_model.joblib")

print(f"Best model saved and run ID for API: {best_run_id}")



🏃 View run LogisticRegression at: http://localhost/#/experiments/2/runs/597c8be0fa5d4faea9004cd61d490eda
🧪 View experiment at: http://localhost/#/experiments/2


EndpointConnectionError: Could not connect to the endpoint URL: "http://minio:9000/mlflow/2/597c8be0fa5d4faea9004cd61d490eda/artifacts/model/MLmodel"