# Model Training with MLflow Tracking

This notebook demonstrates how to:
1. Train a machine learning model
2. Track experiments with MLflow
3. Save the model for production deployment
4. Log model metrics and parameters

**Model**: Customer Churn Prediction (Binary Classification)

**MLflow Features**:
- Experiment tracking
- Parameter logging
- Metric logging
- Model versioning

In [None]:
# Import required libraries
# Standard library
import json
from pathlib import Path

# Data handling and plotting
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ML libraries
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    confusion_matrix,
    classification_report
)

# Model saving
import joblib

# MLflow for experiment tracking
import mlflow
import mlflow.sklearn

# Configuration
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's 'whitegrid' style to the plots drawn below.
sns.set_style('whitegrid')
# Seed NumPy's global random number generator.
np.random.seed(42)

print("✓ All libraries imported successfully")

## 1. Generate Synthetic Data

For this demo, we'll create synthetic customer churn data.
In production, you would load real data from your data warehouse.

In [None]:
# Generate synthetic customer data.
# The seed is set again here so that re-running this cell alone always
# produces the same frame, regardless of what earlier cells consumed from
# NumPy's global random stream.
np.random.seed(42)
n_samples = 5000

# Features. Every column is drawn independently from the global NumPy RNG;
# the order of the draws below fixes the random stream, so keep it stable
# if the generated data must stay reproducible.
data = {
    'tenure_months': np.random.randint(1, 72, n_samples),        # integers 1..71
    'monthly_charges': np.random.uniform(20, 150, n_samples),
    'total_charges': np.random.uniform(100, 8000, n_samples),
    'contract_length': np.random.choice([1, 12, 24], n_samples),
    'num_products': np.random.randint(1, 5, n_samples),          # integers 1..4
    'support_tickets': np.random.randint(0, 10, n_samples),      # integers 0..9
    'satisfaction_score': np.random.uniform(1, 5, n_samples)
}

df = pd.DataFrame(data)

# Create target: churn is more likely with:
# - Low tenure
# - High monthly charges
# - Many support tickets
# - Low satisfaction
# The raw linear score can fall outside [0, 1] (roughly -0.29 to 1.35 for the
# feature ranges above), so it is clipped to be a valid probability. Clipping
# does not change the sampled labels: the uniform draw below lies in [0, 1),
# so a score <= 0 never churns and a score >= 1 always churns either way.
churn_probability = (
    0.5 -
    (df['tenure_months'] / 200) +
    (df['monthly_charges'] / 300) +
    (df['support_tickets'] / 20) -
    (df['satisfaction_score'] / 10)
).clip(lower=0.0, upper=1.0)
df['churn'] = (np.random.random(n_samples) < churn_probability).astype(int)

# Sanity checks on the generated frame before anything downstream uses it.
assert len(df) == n_samples, "unexpected number of generated rows"
assert churn_probability.between(0.0, 1.0).all(), "churn probability outside [0, 1]"
assert df['churn'].isin([0, 1]).all(), "churn label must be binary"

print(f"Generated {len(df)} customer records")
print(f"Churn rate: {df['churn'].mean():.2%}")
print(f"\nFeatures: {df.columns.tolist()}")
df.head()

In [None]:
# Exploratory data analysis: four panels on a single 2x2 figure.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))

# Churn distribution (row count per class)
df['churn'].value_counts().plot(kind='bar', ax=axes[0, 0])
axes[0, 0].set_title('Churn Distribution')
axes[0, 0].set_ylabel('Count')

# Tenure vs Churn: one overlaid histogram per class. Each histogram carries
# its own label, so the legend cannot be mismatched with the plotting order.
for churn_flag, churn_label in ((0, 'No Churn'), (1, 'Churn')):
    axes[0, 1].hist(
        df.loc[df['churn'] == churn_flag, 'tenure_months'],
        bins=20,
        alpha=0.6,
        label=churn_label,
    )
axes[0, 1].set_title('Tenure by Churn Status')
axes[0, 1].set_xlabel('Tenure (months)')
axes[0, 1].set_ylabel('Count')
axes[0, 1].legend()

# Monthly charges vs Churn
df.boxplot(column='monthly_charges', by='churn', ax=axes[1, 0])
axes[1, 0].set_title('Monthly Charges by Churn')
# DataFrame.boxplot(by=...) also sets a figure-level suptitle
# ("Boxplot grouped by churn"), which would sit above all four panels.
# Clear it; the panel title above already describes this plot.
fig.suptitle('')

# Correlation heatmap (all columns are numeric, including the churn label)
sns.heatmap(df.corr(), annot=True, fmt='.2f', ax=axes[1, 1], cmap='coolwarm')
axes[1, 1].set_title('Feature Correlations')

plt.tight_layout()
plt.show()

## 2. Data Preparation

Split data into training and test sets, then scale features.

In [None]:
# Split the frame into the label to predict (y) and the model inputs (X):
# X is every column of df except 'churn'.
y = df['churn']
X = df.drop(columns=['churn'])

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Feature names: {X.columns.tolist()}")

In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the churn
# rate (printed below) comparable between the two partitions, and the fixed
# random_state makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")
print(f"Train churn rate: {y_train.mean():.2%}")
print(f"Test churn rate: {y_test.mean():.2%}")

In [None]:
# Feature scaling: the scaler's statistics are learned from the training rows
# only, and that one fitted scaler is then applied to both partitions so the
# test rows do not influence the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("✓ Features scaled successfully")

## 3. Model Training with MLflow

Train a Random Forest classifier and track the experiment with MLflow.

In [None]:
# Point MLflow at a local file store (./mlruns, resolved against the current
# working directory) and select the experiment that runs are recorded under.
experiment_name = "customer_churn_prediction"
mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment(experiment_name)

print("✓ MLflow experiment configured")
print(f"  Tracking URI: {mlflow.get_tracking_uri()}")
# Read the experiment back by name to confirm it is visible in the store.
active_experiment = mlflow.get_experiment_by_name(experiment_name)
print(f"  Experiment: {active_experiment.name}")

In [None]:
# Model hyperparameters. The same dict is logged to MLflow and unpacked into
# the estimator, so what is recorded is exactly what was trained.
params = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    random_state=42,
)

# Everything inside the `with` block is recorded under one MLflow run.
with mlflow.start_run(run_name="random_forest_v1"):

    # Log parameters
    mlflow.log_params(params)

    # Train model on the standardized training features
    print("Training Random Forest model...")
    model = RandomForestClassifier(**params).fit(X_train_scaled, y_train)
    print("✓ Model trained successfully")

    # Make predictions: hard labels, plus the positive-class (churn)
    # probability taken from column 1 of predict_proba.
    y_pred = model.predict(X_test_scaled)
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

    # Calculate metrics. The first four score the hard labels; ROC AUC is
    # computed from the predicted probabilities instead.
    label_scorers = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    metrics = {name: scorer(y_test, y_pred) for name, scorer in label_scorers}
    metrics['roc_auc'] = roc_auc_score(y_test, y_pred_proba)

    # Log metrics and the fitted model to the active run
    mlflow.log_metrics(metrics)
    mlflow.sklearn.log_model(model, "model")

    # Print results
    print("\n=== Model Performance ===")
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name:15s}: {metric_value:.4f}")

    print("\n✓ Metrics logged to MLflow")

In [None]:
# Confusion matrix of the test-set predictions
# (rows = actual class, columns = predicted class).
cm = confusion_matrix(y_test, y_pred)

# Draw on an explicit Axes rather than through the pyplot state machine.
cm_fig, cm_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=cm_ax)
cm_ax.set_title('Confusion Matrix')
cm_ax.set_ylabel('Actual')
cm_ax.set_xlabel('Predicted')
plt.show()

# Per-class precision / recall / F1 with readable class names.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['No Churn', 'Churn']))

In [None]:
# Feature importance: pair each input column with the forest's importance
# score, then rank from most to least important.
importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
})
feature_importance = importance_table.sort_values(by='importance', ascending=False)

imp_fig, imp_ax = plt.subplots(figsize=(10, 6))
imp_ax.barh(feature_importance['feature'], feature_importance['importance'])
imp_ax.set_xlabel('Importance')
imp_ax.set_title('Feature Importance')
# barh draws the first row at the bottom; flip the axis so the most
# important feature appears at the top.
imp_ax.invert_yaxis()
plt.show()

print("\nTop 3 Most Important Features:")
print(feature_importance.head(3))

## 4. Save Model for Production

Save the trained model to the `../models/` directory (one level above the notebook's working directory) for deployment.

In [None]:
# Create models directory if it doesn't exist (parents=True also creates any
# missing intermediate directories, should the path ever be nested deeper).
models_dir = Path('../models')
models_dir.mkdir(parents=True, exist_ok=True)

# Save model
model_path = models_dir / 'model.pkl'
joblib.dump(model, model_path)

# Save the fitted scaler next to the model. The classifier was trained on
# standardized features, so anything that feeds it raw feature values has to
# apply this exact scaler first; without it the saved model cannot be used
# correctly on unscaled input.
scaler_path = models_dir / 'scaler.pkl'
joblib.dump(scaler, scaler_path)

print(f"✓ Model saved to: {model_path}")
print(f"  File size: {model_path.stat().st_size / 1024:.2f} KB")
print(f"✓ Scaler saved to: {scaler_path}")

In [None]:
# Test loading the model.
# NOTE: joblib files are pickle-based; only load files from a trusted source
# (here, the file this notebook has just written).
loaded_model = joblib.load(model_path)

# Verify the round trip: the reloaded model must reproduce the in-memory
# model's predictions on the whole test set, not merely run without error.
reloaded_predictions = loaded_model.predict(X_test_scaled)
assert np.array_equal(reloaded_predictions, model.predict(X_test_scaled)), \
    "reloaded model disagrees with the in-memory model"

# Show the first five predictions as a quick visual check.
test_prediction = reloaded_predictions[:5]
print("✓ Model loaded and tested successfully")
print(f"  Sample predictions: {test_prediction}")

## 5. Create Sample Prediction Request

Generate an example request for testing the API's `/predict` endpoint, together with the prediction the trained model gives for it.

In [None]:
# Create sample request from the first held-out customer. The values are the
# raw (unscaled) features, keyed by feature name.
sample_customer = X_test.iloc[0].to_dict()

print("Sample API request:")
print("-" * 50)
print("POST /predict")
print("Content-Type: application/json")
print("\nBody:")
request_body = {
    "features": sample_customer,
    "use_cache": True
}
print(json.dumps(request_body, indent=2))

# Get expected prediction.
# Rebuild a one-row DataFrame with the training column names and order before
# scaling. The scaler was fitted on a DataFrame, so passing a bare list would
# drop the feature names (scikit-learn warns about that) and would rely on the
# dict's value order happening to match the training column order.
sample_frame = pd.DataFrame([sample_customer], columns=X_test.columns)
sample_scaled = scaler.transform(sample_frame)
expected_prediction = model.predict(sample_scaled)[0]
expected_proba = model.predict_proba(sample_scaled)[0][1]  # P(churn)

print("\nExpected response:")
print(f"  Prediction: {expected_prediction}")
print(f"  Probability: {expected_proba:.4f}")

## Summary

This notebook demonstrated:

1. ✓ Generating synthetic customer churn data
2. ✓ Training a Random Forest classifier
3. ✓ Tracking experiments with MLflow
4. ✓ Evaluating model performance
5. ✓ Saving model for production deployment

**Next Steps**:
1. Start the FastAPI application: `docker-compose up`
2. Test the `/predict` endpoint with sample data
3. View metrics in Prometheus: http://localhost:9090
4. Create dashboards in Grafana: http://localhost:3000
5. View MLflow tracking: run `mlflow ui` from the directory that contains `./mlruns` (the notebook's working directory), then open http://localhost:5000

**Model Versioning**:
- To deploy a new model version, retrain and save to `models/model.pkl`
- Call `/model/reload` endpoint to hot-reload without restarting
- Set `MODEL_VERSION` environment variable to track versions