In [None]:
# Import required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression

# Import ds_utils ML evaluation functions
from ds_utils.ml_eval import (
    classification_summary,
    regression_summary,
    plot_confusion_matrix,
    plot_roc_curve,
    plot_precision_recall_curve,
    plot_calibration_curve,
    plot_residuals,
    plot_prediction_error,
    plot_residual_distribution,
    plot_feature_importance,
    plot_learning_curve,
    plot_validation_curve,
)
from ds_utils.plotting import apply_corporate_style

# Apply the ds_utils plotting style once, before any figure in this notebook is created.
# NOTE(review): presumably this changes global matplotlib settings — confirm in ds_utils.plotting.
apply_corporate_style()

## 1. Classification Example

In [None]:
# Synthetic two-class dataset; the fixed seed keeps the notebook reproducible
X_clf, y_clf = make_classification(
    n_samples=1000, n_features=20,
    n_informative=10, n_redundant=5,
    n_classes=2, random_state=42,
)

# One label per column of X_clf (reused by the feature-importance cells)
feature_names = [f'feature_{i}' for i in range(X_clf.shape[1])]

# Hold out 20% of the rows for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X_clf,
    y_clf,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training split; fit() returns the estimator itself
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Hard labels, plus the second predict_proba column (class 1) for the curve plots
y_pred = clf.predict(X_test)
y_proba = clf.predict_proba(X_test)[:, 1]

# Report the split sizes, one per line
print(
    f"Training samples: {len(X_train)}",
    f"Test samples: {len(X_test)}",
    sep="\n",
)

In [None]:
# Classification metrics summary computed from the held-out labels and the
# hard predictions (y_pred); the probabilities in y_proba are not passed here.
metrics = classification_summary(y_test, y_pred)
# Print a plain-text heading, then let the notebook render the summary object
# with its rich repr via display().
print("Classification Metrics:")
display(metrics)

In [None]:
# Confusion matrix for the held-out predictions (no normalisation requested)
fig, ax = plot_confusion_matrix(
    y_test,
    y_pred,
    labels=['Negative', 'Positive'],
    title='Confusion Matrix',
)
plt.show()

In [None]:
# Same matrix with normalize='true'
# NOTE(review): presumably normalised per true class, as in scikit-learn — confirm in ds_utils
fig, ax = plot_confusion_matrix(
    y_test,
    y_pred,
    labels=['Negative', 'Positive'],
    normalize='true',
    title='Normalized Confusion Matrix',
)
plt.show()

In [None]:
# ROC curve built from the class-1 probabilities, with the AUC display switched on
fig, ax = plot_roc_curve(y_test, y_proba, title='ROC Curve', show_auc=True)
plt.show()

In [None]:
# Precision-recall curve from the same probabilities, with show_ap enabled
fig, ax = plot_precision_recall_curve(
    y_test,
    y_proba,
    title='Precision-Recall Curve',
    show_ap=True,
)
plt.show()

In [None]:
# Calibration (reliability) plot of the predicted probabilities, using 10 bins
fig, ax = plot_calibration_curve(y_test, y_proba, title='Calibration Curve', n_bins=10)
plt.show()

## 2. Regression Example

In [None]:
# Synthetic regression problem: 10 features (5 informative) with noise=20
X_reg, y_reg = make_regression(
    n_samples=500, n_features=10,
    n_informative=5, noise=20,
    random_state=42,
)

# Hold out 20% of the rows for evaluation
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest regressor; fit() returns the estimator itself
reg = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_reg, y_train_reg)

# Point predictions for the held-out rows
y_pred_reg = reg.predict(X_test_reg)

# Report the split sizes, one per line
print(
    f"Training samples: {len(X_train_reg)}",
    f"Test samples: {len(X_test_reg)}",
    sep="\n",
)

In [None]:
# Regression metrics summary computed from the held-out targets and the
# random-forest predictions.
metrics_reg = regression_summary(y_test_reg, y_pred_reg)
# Print a plain-text heading, then let the notebook render the summary object
# with its rich repr via display().
print("Regression Metrics:")
display(metrics_reg)

In [None]:
# Held-out targets against model predictions, with show_r2 enabled
fig, ax = plot_prediction_error(
    y_test_reg,
    y_pred_reg,
    title='Actual vs Predicted',
    show_r2=True,
)
plt.show()

In [None]:
# Residuals for the held-out rows, plotted under the title below
fig, ax = plot_residuals(y_test_reg, y_pred_reg, title='Residuals vs Predicted')
plt.show()

In [None]:
# Distribution view of the residuals; the second return value is bound to `axes`
# NOTE(review): presumably more than one Axes is returned — confirm in ds_utils
fig, axes = plot_residual_distribution(y_test_reg, y_pred_reg, title='Residual Analysis')
plt.show()

## 3. Feature Importance

In [None]:
# Importances taken straight from the fitted forest, passed as an array
# together with the matching feature names; top_n=15 of the 20 features
fig, ax = plot_feature_importance(
    clf.feature_importances_, feature_names=feature_names,
    top_n=15, title='Feature Importance (Random Forest)',
)
plt.show()

In [None]:
# Same importances, this time as a {feature name: importance} mapping
importance_dict = {
    label: score
    for label, score in zip(feature_names, clf.feature_importances_)
}

# The mapping carries the names itself, so feature_names is not passed here
fig, ax = plot_feature_importance(importance_dict, top_n=10, title='Top 10 Features')
plt.show()

## 4. Learning Curves

In [None]:
# Learning curve for a fresh (unfitted) 50-tree forest on the full classification
# dataset, with cv=5
# NOTE(review): n_jobs=-1 presumably means "use all cores", as in scikit-learn — confirm
fig, ax = plot_learning_curve(
    RandomForestClassifier(n_estimators=50, random_state=42),
    X_clf,
    y_clf,
    cv=5,
    title='Learning Curve',
    n_jobs=-1,
)
plt.show()

In [None]:
# Validation curve: sweep n_estimators over five values on the full classification
# dataset, with cv=3
fig, ax = plot_validation_curve(
    RandomForestClassifier(random_state=42),
    X_clf,
    y_clf,
    param_name='n_estimators',
    param_range=[10, 25, 50, 75, 100],
    cv=3,
    title='Validation Curve: n_estimators',
    n_jobs=-1,
)
plt.show()

## 5. Model Comparison

In [None]:
# Compare multiple models using ROC curves.
# Every estimator used here is already imported in the first cell, so this cell
# needs no imports of its own.

# Candidate classifiers, keyed by a display name.
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
}

# Fit each candidate on the shared training split and keep the second
# predict_proba column (class 1) for the held-out rows, under the same name.
model_probas = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    model_probas[name] = model.predict_proba(X_test)[:, 1]

# Pass the whole {name: probabilities} dict so all models land on one figure.
fig, ax = plot_roc_curve(
    y_test, model_probas,
    title='Model Comparison: ROC Curves',
)
plt.show()

In [None]:
# Precision-recall comparison, reusing the {name: probabilities} dict from the ROC cell
fig, ax = plot_precision_recall_curve(
    y_test,
    model_probas,
    title='Model Comparison: Precision-Recall Curves',
)
plt.show()

In [None]:
# Calibration comparison for the same models (n_bins is left at the helper's default)
fig, ax = plot_calibration_curve(y_test, model_probas, title='Model Comparison: Calibration')
plt.show()

## Summary

This notebook demonstrated:
- Classification metrics and visualizations
- Regression metrics and residual analysis
- Feature importance plots
- Learning and validation curves
- Model comparison techniques