# Calibration curve

In [1]:
import numpy as np

# Machine Learning
from sklearn.datasets import make_classification
from sklearn.calibration import calibration_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression

# Data visualization
import matplotlib.pyplot as plt

In [9]:
def expected_calibration_error(y, proba, bins='fd'):
    """Compute the expected calibration error (ECE) of predicted probabilities.

    Samples are grouped into bins by predicted probability. The ECE is the
    sample-weighted average, over bins, of the absolute gap between the mean
    predicted probability and the mean observed outcome inside each bin.

    Parameters
    ----------
    y : array-like of shape (n_samples,)
        Observed outcomes, used as weights when accumulating per-bin sums
        (e.g. 0/1 labels).
    proba : array-like of shape (n_samples,)
        Predicted probabilities, one per sample.
    bins : int, sequence of scalars or str, default='fd'
        Bin specification forwarded to ``np.histogram`` to obtain the bin
        edges ('fd' is the Freedman-Diaconis rule).

    Returns
    -------
    float
        The expected calibration error.
    """
    y = np.asarray(y, dtype=float)
    proba = np.asarray(proba, dtype=float)

    # Only the edges are taken from np.histogram. Its counts use left-closed
    # bins [a, b) whereas the assignment below uses right-closed bins (a, b],
    # so mixing the two misplaces (and can silently drop) samples that sit
    # exactly on an interior edge.
    _, bin_edges = np.histogram(proba, bins=bins)
    n_bins = len(bin_edges) - 1

    # Right-closed bin index for every sample. A value equal to the leftmost
    # edge yields -1 and is clipped into the first bin.
    bin_id = np.clip(np.digitize(proba, bin_edges, right=True) - 1,
                     0,
                     n_bins - 1)

    bin_ysum = np.bincount(bin_id, weights=y, minlength=n_bins)
    bin_probasum = np.bincount(bin_id, weights=proba, minlength=n_bins)

    # |mean_proba - mean_y| * count equals |sum_proba - sum_y| for non-empty
    # bins, and empty bins contribute zero, so no division is needed.
    ece = np.abs(bin_probasum - bin_ysum).sum() / proba.size
    return ece

In [3]:
# Synthetic binary classification problem with class weights 0.9 / 0.1
X, y = make_classification(n_samples=15000,
                           n_features=50,
                           n_informative=30,
                           n_redundant=20,
                           weights=[0.9, 0.1],
                           random_state=0)

# Three consecutive 5000-row splits: train, validation, test
X_train, y_train = X[:5000], y[:5000]
X_valid, y_valid = X[5000:10000], y[5000:10000]
X_test, y_test = X[10000:], y[10000:]

#### Random Forest Classifier

In [13]:
# Fit a Random Forest Classifier on the training split. The seed is pinned so
# that the predicted probabilities, and the calibration errors computed from
# them, are reproducible across runs (the dataset above is seeded the same way).
forest = RandomForestClassifier(random_state=0).fit(X_train, y_train)

print(f"Random Forest Classifier classes = {forest.classes_}")

# Positive-class probabilities (second column of predict_proba) on the
# validation split; these are what the calibrators are fitted on.
proba_valid = forest.predict_proba(X_valid)[:, 1]

# Uncalibrated positive-class probabilities on the test split
proba_test_forest = forest.predict_proba(X_test)[:, 1]

Random Forest Classifier classes = [0 1]


#### Isotonic Regression

In [12]:
# Monotonic calibration map: fit isotonic regression from the forest's
# validation probabilities to the validation labels, with outputs kept in
# [0, 1] and out-of-range inputs clipped.
iso_reg = IsotonicRegression(y_min=0, y_max=1, out_of_bounds='clip')
iso_reg.fit(proba_valid, y_valid)

# Calibrated positive-class probabilities on the test split
proba_test_forest_isoreg = iso_reg.predict(
    forest.predict_proba(X_test)[:, 1]
)

#### Logistic Regression

In [7]:
# Fit a logistic regression from the forest's validation probabilities
# (reshaped into a single feature column) to the validation labels.
log_reg = LogisticRegression()
log_reg.fit(proba_valid.reshape(-1, 1), y_valid)

# Calibrated positive-class probabilities on the test split
proba_test_forest_logreg = log_reg.predict_proba(
    forest.predict_proba(X_test)[:, 1].reshape(-1, 1)
)[:, 1]

#### Expected Calibration Error

In [18]:

# Expected calibration error on the test split for the raw forest and for
# each of the two calibrated variants.
ece_forest = expected_calibration_error(y_test, proba_test_forest, bins='fd')
ece_forest_isoreg = expected_calibration_error(y_test, proba_test_forest_isoreg, bins='fd')
ece_forest_logreg = expected_calibration_error(y_test, proba_test_forest_logreg, bins='fd')

# Report each error as a percentage
print(f"Random Forest: Expected Calibration Error = {100*ece_forest:.1f}%")
print(f"Random Forest & Isotonic Regression: Expected Calibration Error = {100*ece_forest_isoreg:.1f}%")
print(f"Random Forest & Logistic Regression: Expected Calibration Error = {100*ece_forest_logreg:.1f}%")

Random Forest: Expected Calibration Error = 7.0%
Random Forest & Isotonic Regression: Expected Calibration Error = 1.2%
Random Forest & Logistic Regression: Expected Calibration Error = 2.2%
