In [1]:
import os
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
# Resolve the data directory relative to the kernel's working directory.
# The project root is taken to be the parent of cwd, so the path below only
# resolves correctly when the notebook is run from a direct child of the root.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
processed_dir = os.path.join(project_root, "notebooks", "data")

# Train / validation splits: features and labels are stored in separate files.
X_train = pd.read_csv(os.path.join(processed_dir, "X_train_scaled.csv"))
y_train = pd.read_csv(os.path.join(processed_dir, "y_train.csv"))

X_valid = pd.read_csv(os.path.join(processed_dir, "X_valid_scaled.csv"))
y_valid = pd.read_csv(os.path.join(processed_dir, "y_valid.csv"))

# Test split: one file holding the features together with the "Churn" label.
# NOTE(review): train/valid features are read from "*_scaled.csv" files while
# the test set is read from "test_preprocessed.csv" -- confirm that the scaler
# fitted on the training data was also applied to the test features.
df_test = pd.read_csv(os.path.join(processed_dir, "test_preprocessed.csv"))

X_test = df_test.drop(columns=["Churn"])
y_test = df_test["Churn"]

# Fail fast if an evaluation split does not expose exactly the training
# feature set; a silent schema drift would make every metric meaningless.
for _split_name, _split_X in (("validation", X_valid), ("test", X_test)):
    _missing = X_train.columns.difference(_split_X.columns)
    _unexpected = _split_X.columns.difference(X_train.columns)
    if len(_missing) > 0 or len(_unexpected) > 0:
        raise ValueError(
            f"{_split_name} features do not match the training features: "
            f"missing={list(_missing)}, unexpected={list(_unexpected)}"
        )

# Same columns are guaranteed at this point; enforce the training column
# order too (a no-op when the files are already aligned).
X_valid = X_valid[X_train.columns]
X_test = X_test[X_train.columns]

# Features and labels of each split must describe the same rows.
assert len(X_train) == len(y_train), "X_train / y_train row counts differ"
assert len(X_valid) == len(y_valid), "X_valid / y_valid row counts differ"
assert len(X_test) == len(y_test), "X_test / y_test row counts differ"

In [3]:
# Point MLflow at the local tracking server and select the experiment that
# groups every run of this notebook.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Customer Churn Classification")

with mlflow.start_run(run_name="L1_Regularized_Logistic_Regression") as run:
    print("\n--- Starting Run: L1 Regularized Logistic Regression ---")

    # --- a. Define Model and Parameters ---
    lgr_params = {
        'penalty': 'l1',
        'C': 0.1,  # Strong regularization
        'solver': 'liblinear',
        'class_weight': 'balanced',
        'random_state': 42
    }
    lgr = LogisticRegression(**lgr_params)

    mlflow.log_params(lgr_params)

    # --- b. Train and Evaluate ---
    print("Training Logistic Regression...")
    # y_train is loaded as a single-column frame; flatten it to the 1-D
    # array that the estimator expects.
    lgr.fit(X_train, y_train.values.ravel())

    # Evaluate on Validation Set
    y_pred_valid_lgr = lgr.predict(X_valid)
    valid_f1_lgr = f1_score(y_valid, y_pred_valid_lgr)
    mlflow.log_metric("validation_f1_score", valid_f1_lgr)
    print(f"Validation F1-Score (LGR): {valid_f1_lgr:.4f}")

    # Evaluate on Test Set
    y_pred_test_lgr = lgr.predict(X_test)
    # zero_division=0 returns the same value as the default ("warn") but makes
    # the fallback explicit instead of emitting UndefinedMetricWarning when the
    # model predicts no positive sample at all.
    test_f1_lgr = f1_score(y_test, y_pred_test_lgr, zero_division=0)
    test_accuracy_lgr = accuracy_score(y_test, y_pred_test_lgr)
    test_precision_lgr = precision_score(y_test, y_pred_test_lgr, zero_division=0)
    test_recall_lgr = recall_score(y_test, y_pred_test_lgr, zero_division=0)

    mlflow.log_metrics({
        "test_accuracy": test_accuracy_lgr,
        "test_precision": test_precision_lgr,
        "test_recall": test_recall_lgr,
        "test_f1_score": test_f1_lgr,
    })
    print(f"Test F1-Score (LGR): {test_f1_lgr:.4f}")

    # --- c. Log Artifacts and Model ---
    # The confusion matrix shows *how* the test predictions fail (e.g. a
    # single predicted class), which the scalar metrics above cannot.
    cm_test_lgr = confusion_matrix(y_test, y_pred_test_lgr, labels=lgr.classes_)
    fig_cm_lgr, ax_cm_lgr = plt.subplots(figsize=(5, 4))
    ConfusionMatrixDisplay(
        confusion_matrix=cm_test_lgr, display_labels=lgr.classes_
    ).plot(ax=ax_cm_lgr)
    ax_cm_lgr.set_title("Logistic Regression - Test Confusion Matrix")
    mlflow.log_figure(fig_cm_lgr, "confusion_matrix_test.png")
    plt.show()

    mlflow.sklearn.log_model(lgr, "logistic_regression_model")
    print("--- Run Finished ---")


--- Starting Run: L1 Regularized Logistic Regression ---
Training Logistic Regression...
Validation F1-Score (LGR): 0.8647


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Test F1-Score (LGR): 0.0000




--- Run Finished ---
🏃 View run L1_Regularized_Logistic_Regression at: http://127.0.0.1:5000/#/experiments/849762709934598647/runs/bea21584cfed4f8b88def77574a5726e
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/849762709934598647


In [4]:
# Point MLflow at the local tracking server and select the experiment that
# groups every run of this notebook (repeated so the cell runs on its own).
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("Customer Churn Classification")

with mlflow.start_run(run_name="Regularized_Random_Forest") as run:
    print("\n--- Starting Run: Regularized Random Forest ---")

    # --- a. Define Model and Parameters ---
    rf_params = {
        'n_estimators': 100,
        'max_depth': 7,          # Control tree depth to prevent overfitting
        'min_samples_leaf': 20,  # Ensure leaves are not too specific
        'class_weight': 'balanced',
        'random_state': 42
    }
    rf = RandomForestClassifier(**rf_params)

    mlflow.log_params(rf_params)

    # --- b. Train and Evaluate ---
    print("Training Random Forest...")
    # y_train is loaded as a single-column frame; flatten it to the 1-D
    # array that the estimator expects.
    rf.fit(X_train, y_train.values.ravel())

    # Evaluate on Validation Set
    y_pred_valid_rf = rf.predict(X_valid)
    valid_f1_rf = f1_score(y_valid, y_pred_valid_rf)
    mlflow.log_metric("validation_f1_score", valid_f1_rf)
    print(f"Validation F1-Score (RF): {valid_f1_rf:.4f}")

    # Evaluate on Test Set
    y_pred_test_rf = rf.predict(X_test)
    # zero_division=0 returns the same value as the default ("warn") but makes
    # the fallback explicit instead of emitting UndefinedMetricWarning when the
    # model predicts no positive sample at all.
    test_f1_rf = f1_score(y_test, y_pred_test_rf, zero_division=0)
    test_accuracy_rf = accuracy_score(y_test, y_pred_test_rf)
    test_precision_rf = precision_score(y_test, y_pred_test_rf, zero_division=0)
    test_recall_rf = recall_score(y_test, y_pred_test_rf, zero_division=0)

    mlflow.log_metrics({
        "test_accuracy": test_accuracy_rf,
        "test_precision": test_precision_rf,
        "test_recall": test_recall_rf,
        "test_f1_score": test_f1_rf,
    })
    print(f"Test F1-Score (RF): {test_f1_rf:.4f}")

    # --- c. Log Artifacts and Model ---
    # The confusion matrix shows *how* the test predictions fail (e.g. a
    # skew towards one class), which the scalar metrics above cannot.
    cm_test_rf = confusion_matrix(y_test, y_pred_test_rf, labels=rf.classes_)
    fig_cm_rf, ax_cm_rf = plt.subplots(figsize=(5, 4))
    ConfusionMatrixDisplay(
        confusion_matrix=cm_test_rf, display_labels=rf.classes_
    ).plot(ax=ax_cm_rf)
    ax_cm_rf.set_title("Random Forest - Test Confusion Matrix")
    mlflow.log_figure(fig_cm_rf, "confusion_matrix_test.png")
    plt.show()

    mlflow.sklearn.log_model(rf, "random_forest_model")
    print("--- Run Finished ---")




--- Starting Run: Regularized Random Forest ---
Training Random Forest...
Validation F1-Score (RF): 0.9775




Test F1-Score (RF): 0.6429




--- Run Finished ---
🏃 View run Regularized_Random_Forest at: http://127.0.0.1:5000/#/experiments/849762709934598647/runs/5a3ec930ca4e49f886eb8f84807b0f5c
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/849762709934598647


After trying the main model-level remedies:  
- Complex model (Random Forest)  
- Simple model (Logistic Regression)  
- Regularization to penalize overfitting  

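…all of them **failed on the test data**, despite scoring well on validation (F1 of 0.86 → 0.00 for Logistic Regression and 0.98 → 0.64 for Random Forest).  
Because models of very different complexity generalize to the validation set but not to the test set, the most plausible conclusion is that the issue lies **in the data** rather than in the model choice. (It is still worth verifying that the test file went through exactly the same preprocessing and scaling as the training features.)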

There appears to be a **fundamental mismatch** between the statistical distributions of the training and test sets.  
The model learns the rules of one game, but is then asked to play an entirely different game with different rules. Under such conditions, no model can succeed.

---

## ✅ Proposed Final Solution: Data Unification & Re-Splitting
Since the root problem is in the data, the solution must also come from the data.  
The logical next step is to create a consistent and homogeneous training–testing environment.

**Plan:**  
1. **Merge the data**: Combine the original training and test files into one dataset.  
2. **Full shuffle**: Randomly shuffle the unified dataset to eliminate any order or bias.  
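3. **Re-split**: Divide the shuffled dataset again into training, validation, and test sets using fixed ratios, a fixed random seed, and stratification on `Churn`; then re-fit any scaler on the new training split only, so no information leaks from the validation or test sets.  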

This process ensures that the model is both trained and tested on data from the **same statistical source**, creating a realistic and valid scenario for learning and success.