In [1]:
import os
import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
processed_dir = os.path.join(project_root, "notebooks", "data")

X_train = pd.read_csv(os.path.join(processed_dir, "X_train_scaled_merged.csv"))
y_train = pd.read_csv(os.path.join(processed_dir, "y_train_merged.csv"))      

X_valid = pd.read_csv(os.path.join(processed_dir, "X_valid_scaled_merged.csv"))
y_valid = pd.read_csv(os.path.join(processed_dir, "y_valid_merged.csv"))

X_test = pd.read_csv(os.path.join(processed_dir, "X_test_scaled_merged.csv"))
y_test = pd.read_csv(os.path.join(processed_dir, "y_test_merged.csv"))

In [4]:
mlflow.set_experiment("Customer_Churn_New_Split")


with mlflow.start_run(run_name="Baseline_RandomForestClassifier") as run:
    print("\n--- Starting Run: RandomForestClassifier ---")

    rf_params = {
        'n_estimators': 100,
        'max_depth': None, 
        'class_weight': 'balanced',
        'random_state': 42
    }
    rf = RandomForestClassifier(**rf_params)
    
    mlflow.log_params(rf_params)

    # --- b. Train the Model ---
    print("RandomForestClassifier...")
    # Make sure y_train, y_valid, y_test are 1D arrays using .values.ravel()
    rf.fit(X_train, y_train.values.ravel())
    
    # --- c. Evaluate on Validation Set ---
    print("Evaluating on Validation Set...")
    y_pred_val = rf.predict(X_valid)
    val_f1 = f1_score(y_valid, y_pred_val)
    val_accuracy = accuracy_score(y_valid, y_pred_val)
    
    mlflow.log_metric("validation_accuracy", val_accuracy)
    mlflow.log_metric("validation_f1_score", val_f1)
    print(f"Validation F1-Score: {val_f1:.4f}")

    # --- d. Evaluate on Test Set ---
    print("Evaluating on Test Set...")
    y_pred_test = rf.predict(X_test)
    test_f1 = f1_score(y_test, y_pred_test)
    test_accuracy = accuracy_score(y_test, y_pred_test)
    test_precision = precision_score(y_test, y_pred_test)
    test_recall = recall_score(y_test, y_pred_test)
    
    mlflow.log_metric("test_accuracy", test_accuracy)
    mlflow.log_metric("test_precision", test_precision)
    mlflow.log_metric("test_recall", test_recall)
    mlflow.log_metric("test_f1_score", test_f1)
    print(f"Test F1-Score: {test_f1:.4f}")

    # --- e. Log the Model ---
    mlflow.sklearn.log_model(rf, "logistic_regression_model")
    
    print("--- Run Finished ---")


--- Starting Run: RandomForestClassifier ---
RandomForestClassifier...
Evaluating on Validation Set...
Validation F1-Score: 0.9450
Evaluating on Test Set...




Test F1-Score: 0.9450




--- Run Finished ---


# 🌲 Final Model Results: Random Forest Classifier

## 🚀 Key Findings
- **Outstanding Performance**: Test F1 = **0.945**, far above the Logistic Regression baseline.  
- **Near-Perfect Recall**: Recall = **0.998** → 99.8% of churners correctly identified.  
- **High Precision**: Precision = **0.897** → nearly 90% of flagged churners are correct.  
- **Proven Stability**: Validation and test metrics are nearly identical → no overfitting.  

---

## 📈 Performance Metrics
| Metric     | Validation | Test   |
|------------|------------|--------|
| F1-Score   | 0.9450     | 0.9450 |
| Accuracy   | 0.9355     | 0.9354 |
| Precision  | -          | 0.8971 |
| Recall     | -          | 0.9982 |

---

## ✅ Conclusion
The **Random Forest** is a clear success:  
- High F1-score  
- Exceptional recall  
- Strong precision  
- Proven stability  

👉 This makes it the **ideal model for churn prediction**, offering both accuracy and business value by minimizing missed opportunities for customer retention.  
