# Telco Customer Churn: MLflow Experiment Tracking

## 1. Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import sys
import os
import yaml
import mlflow
import mlflow.sklearn
from datetime import datetime

# Add the parent of the current working directory to the import path;
# presumably this is the repository root, so the `src.*` imports below resolve
# when the notebook is run from its own subdirectory.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

from src.data_loader import TelcoDataLoader
from src.preprocessor import DataPreprocessor
from src.ensemble_models import RandomForestChurnModel, XGBoostChurnModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# Load the shared project configuration. The encoding is pinned to UTF-8 so the
# YAML file is decoded the same way regardless of the platform's default locale
# encoding.
with open('../config/config.yaml', 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

print('Setup complete')

## 2. Configure MLflow

In [None]:
# Read the tracking backend and experiment name from the project config.
mlflow_tracking_uri = config['mlflow']['tracking_uri']
experiment_name = config['mlflow']['experiment_name']

# The tracking URI must be applied before the experiment is selected; without
# this call the configured URI is ignored and runs are written to MLflow's
# default tracking location instead.
mlflow.set_tracking_uri(mlflow_tracking_uri)
mlflow.set_experiment(experiment_name)
print(f'Experiment: {experiment_name}')

## 3. Load and Prepare Data

In [None]:
# Load the raw customer table through the project data loader.
loader = TelcoDataLoader()
df = loader.load_raw_data()

# Convert TotalCharges to numeric; values that cannot be parsed become NaN
# (errors='coerce') and the affected rows are then dropped in place.
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')
df.dropna(subset=['TotalCharges'], inplace=True)

# Encode the target as 1 for 'Yes' and 0 otherwise; every remaining column is
# kept as a feature.
target_col = config['target']
y = df[target_col].eq('Yes').astype(int)
X = df.drop(columns=target_col)

# Hold out a test set, stratified on the target, using the configured size and
# seed.
training_cfg = config['training']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=training_cfg['test_size'],
    random_state=training_cfg['random_state'],
    stratify=y,
)

# Fit the preprocessing pipeline on the training split only, then apply the
# fitted pipeline to the test split.
preprocessor = DataPreprocessor().create_preprocessing_pipeline()
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

print(f'Data prepared: train={X_train_transformed.shape}, test={X_test_transformed.shape}')

## 4. Train Models with MLflow Tracking

In [None]:
def train_and_log_model(model, model_name, X_train, X_test, y_train, y_test, params):
    """Train ``model`` inside a new MLflow run and log its evaluation.

    The run is named ``<model_name>_<YYYYmmdd_HHMMSS>``. Within the run the
    function logs ``params``, trains the model, scores it on the test data,
    logs the metrics, logs ``model.model`` under the name ``'model'`` and
    tags the run with ``model_type``.

    Args:
        model: Wrapper object exposing ``train``, ``predict``,
            ``predict_proba`` and a ``model`` attribute holding the
            estimator passed to ``mlflow.sklearn.log_model``.
        model_name: Label used in the run name, the ``model_type`` tag and
            the printed summary.
        X_train: Training features passed to ``model.train``.
        X_test: Test features used for prediction.
        y_train: Training labels passed to ``model.train``.
        y_test: True test labels used for scoring.
        params: Mapping of parameters logged to the run.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall``, ``f1_score``
        and ``roc_auc`` (in that order).
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_label = f'{model_name}_{timestamp}'

    with mlflow.start_run(run_name=run_label):
        mlflow.log_params(params)

        model.train(X_train, y_train)
        predicted_labels = model.predict(X_test)
        # Column 1 of predict_proba is used as the score for ROC AUC.
        positive_scores = model.predict_proba(X_test)[:, 1]

        # Metrics computed from hard label predictions, in reporting order.
        label_scorers = {
            'accuracy': accuracy_score,
            'precision': precision_score,
            'recall': recall_score,
            'f1_score': f1_score,
        }
        metrics = {
            metric_name: scorer(y_test, predicted_labels)
            for metric_name, scorer in label_scorers.items()
        }
        # ROC AUC is the only metric computed from scores rather than labels.
        metrics['roc_auc'] = roc_auc_score(y_test, positive_scores)

        mlflow.log_metrics(metrics)
        mlflow.sklearn.log_model(model.model, 'model')
        mlflow.set_tag('model_type', model_name)

        roc_auc = metrics["roc_auc"]
        print(f'{model_name} - ROC AUC: {roc_auc:.4f}')
        return metrics

In [None]:
# Random forest: build the model from its config section and train/log it.
rf_params = config['models']['random_forest']
rf_model = RandomForestChurnModel(**rf_params)
rf_metrics = train_and_log_model(
    model=rf_model,
    model_name='RandomForest',
    X_train=X_train_transformed,
    X_test=X_test_transformed,
    y_train=y_train,
    y_test=y_test,
    params=rf_params,
)

# XGBoost: same procedure with its own config section.
xgb_params = config['models']['xgboost']
xgb_model = XGBoostChurnModel(**xgb_params)
xgb_metrics = train_and_log_model(
    model=xgb_model,
    model_name='XGBoost',
    X_train=X_train_transformed,
    X_test=X_test_transformed,
    y_train=y_train,
    y_test=y_test,
    params=xgb_params,
)

## 5. Compare Models

In [None]:
# One row per model, one column per metric.
metrics_by_model = {'RandomForest': rf_metrics, 'XGBoost': xgb_metrics}
comparison_df = pd.DataFrame(metrics_by_model).transpose()
print(comparison_df)

# The best model is the row with the highest ROC AUC.
best_model_name = comparison_df['roc_auc'].idxmax()
print(f'Best Model: {best_model_name}')