In [1]:
import os
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np 

In [2]:
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
processed_dir = os.path.join(project_root, "notebooks", "data")

X_train = pd.read_csv(os.path.join(processed_dir, "X_train_scaled.csv"))
y_train = pd.read_csv(os.path.join(processed_dir, "y_train.csv"))

X_valid = pd.read_csv(os.path.join(processed_dir, "X_valid_scaled.csv"))
y_valid = pd.read_csv(os.path.join(processed_dir, "y_valid.csv"))

df_test = pd.read_csv(os.path.join(processed_dir, "test_preprocessed.csv"))
# Assuming the test set has a 'Churn' column for evaluation
X_test = df_test.drop(columns=["Churn"])
y_test = df_test["Churn"]

In [3]:
mlflow.set_experiment("Customer Churn Classification")

# Start a new MLflow run
with mlflow.start_run(run_name="Baseline RandomForest") as run:
    print(f"Starting run: {run.info.run_name}")
    

    rf_params = {
        'n_estimators': 100,
        'max_depth': None, 
        'class_weight': 'balanced',
        'random_state': 42
    }
    rf = RandomForestClassifier(**rf_params)
    
    # Log the model's parameters
    print("Logging parameters...")
    mlflow.log_params(rf_params)

    # --- b. Train the Model ---
    print("Training the model...")
    # Use .values.ravel() to ensure y_train is a 1D array, which sklearn expects
    rf.fit(X_train, y_train.values.ravel())

    # --- c. Evaluate on Validation Set ---
    print("Evaluating on validation set...")
    y_pred_valid = rf.predict(X_valid)
    
    # Calculate validation metrics
    valid_accuracy = accuracy_score(y_valid, y_pred_valid)
    valid_precision = precision_score(y_valid, y_pred_valid)
    valid_recall = recall_score(y_valid, y_pred_valid)
    valid_f1 = f1_score(y_valid, y_pred_valid)
    
    # Log validation metrics
    print("Logging validation metrics...")
    validation_metrics = {
        "validation_accuracy": valid_accuracy,
        "validation_precision": valid_precision,
        "validation_recall": valid_recall,
        "validation_f1_score": valid_f1
    }
    mlflow.log_metrics(validation_metrics)
    print(f"Validation F1-Score: {valid_f1:.4f}")

    # --- d. Evaluate on Test Set ---
    print("Evaluating on test set...")
    y_pred_test = rf.predict(X_test)

    # Calculate test metrics
    test_accuracy = accuracy_score(y_test, y_pred_test)
    test_precision = precision_score(y_test, y_pred_test)
    test_recall = recall_score(y_test, y_pred_test)
    test_f1 = f1_score(y_test, y_pred_test)

    # Log test metrics
    print("Logging test metrics...")
    test_metrics = {
        "test_accuracy": test_accuracy,
        "test_precision": test_precision,
        "test_recall": test_recall,
        "test_f1_score": test_f1
    }
    mlflow.log_metrics(test_metrics)
    print(f"Test F1-Score: {test_f1:.4f}")

    # --- e. Log Artifacts (like plots) ---
    print("Generating and logging confusion matrix...")
    cm = confusion_matrix(y_test, y_pred_test)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)
    disp.plot()
    plt.title("Test Set Confusion Matrix")
    
    # Save the plot to a file and log it as an artifact
    plt.savefig("test_confusion_matrix.png")
    mlflow.log_artifact("test_confusion_matrix.png", "plots")
    plt.close()

    # --- f. Log the Model ---
    print("Logging the model...")
    mlflow.sklearn.log_model(rf, "random_forest_model")
    
    print("\n✅ Run completed successfully!")
    print(f"To see your run, open a terminal and type: mlflow ui")

Starting run: Baseline RandomForest
Logging parameters...
Training the model...
Evaluating on validation set...
Logging validation metrics...
Validation F1-Score: 0.9998
Evaluating on test set...
Logging test metrics...




Test F1-Score: 0.6429
Generating and logging confusion matrix...
Logging the model...





✅ Run completed successfully!
To see your run, open a terminal and type: mlflow ui


In [4]:
!mlflow ui

^C


#  Overfitting Analysis & Proposed Next Step

The model shows **strong performance on the training set** but performs **significantly worse on the testing set** — a clear indication of **overfitting**.

---

## ⚠ Root Cause  
The training data contains overly simplistic, deterministic patterns that the model has **memorized** rather than learning **generalizable trends**.  

These "perfect rules" include:

- 📌 All customers with **monthly contracts** churned  
- 📌 All customers with **more than 5 support calls** churned  
- 📌 All customers **over 50 years old** churned  
- 📌 All customers with **payment delays over 20 days** churned  

These absolute patterns **do not hold true** in validation or test data, leading to poor generalization.

---

## ✅ Proposed Next Step  
To address overfitting, the next step is to try a **simpler model** such as **Logistic Regression**.  

- A **less complex model** is less prone to memorizing artifacts in the training data.  
- It is more likely to capture **robust, generalizable patterns** that extend to unseen samples.  
