# Binary Classifier Evaluation Metrics

In [28]:
import sklearn.metrics
import numpy as np
import jax
import jax.numpy as jnp
from typing import List
import pandas as pd

## Trying Out the Metrics

In [3]:
# define some dummy observations and predictions
y_actual = np.array([0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0]) # N
y_pred = np.array([0.06, 0.92, 0.86, 0.03, 0.40, 0.70, 0.23, 0.4, 0.2, 0.8, 0.9, 0.65, 0.75, 0.4]) # N

In [25]:
# accuracy, notice that we have to threshold
print("Accuracy ", sklearn.metrics.accuracy_score(y_actual, y_pred > 0.5))

# accuracy under null model
# in real world, the mean would be based on TRAINING set
null_ypred = np.mean(y_actual) * np.ones_like(y_actual)
print("Accuracy (Null) ", sklearn.metrics.accuracy_score(y_actual, null_ypred > 0.5))

# balanced accuracy
print("Balanced Accuracy ", sklearn.metrics.balanced_accuracy_score(y_actual, y_pred > 0.5))

print("Balanced Accuracy (Null) ", sklearn.metrics.balanced_accuracy_score(y_actual, null_ypred > 0.5))


# recall
print("Recall ", sklearn.metrics.recall_score(y_actual, y_pred > 0.5))

# recall when saying yes all the time
print("Recall (Ones all the time) ", sklearn.metrics.recall_score(y_actual, np.ones_like(y_actual)))

# precision
print("Precision ", sklearn.metrics.precision_score(y_actual, y_pred > 0.5))

# precision when saying yes all the time
print("Precision (Ones all the time) ", sklearn.metrics.precision_score(y_actual, np.ones_like(y_actual)))


# precision when predicting one positive example that happens to be correct
yhat = np.zeros_like(y_actual)
yhat[1] = 1
print("Precision (Trivial) ", sklearn.metrics.precision_score(y_actual, yhat))

# f1 score
print("F1 ", sklearn.metrics.f1_score(y_actual, y_pred > 0.5))
print("F1 (Ones all the time)", sklearn.metrics.f1_score(y_actual, np.ones_like(y_actual)))

# AUC-ROC, notice: no thresholding
print("AUC-ROC ", sklearn.metrics.roc_auc_score(y_actual, y_pred))
print("AUC-PR ", sklearn.metrics.average_precision_score(y_actual, y_pred))
print("AUC-PR (Null) ", sklearn.metrics.average_precision_score(y_actual, np.mean(y_actual) * np.ones_like(y_actual)))


Accuracy  0.7142857142857143
Accuracy (Null)  0.6428571428571429
Balanced Accuracy  0.7333333333333334
Balanced Accuracy (Null)  0.5
Recall  0.6666666666666666
Recall (Ones all the time)  1.0
Precision  0.8571428571428571
Precision (Ones all the time)  0.6428571428571429
Precision (Trivial)  1.0
F1  0.75
F1 (Ones all the time) 0.782608695652174
AUC-ROC  0.7555555555555555
AUC-PR  0.8063492063492064
AUC-PR (Null)  0.6428571428571429


## Applying to Logistic Regression Model

In [54]:
def forward_fn(Beta, X):
    f = X @ Beta 
    p = 1/(1+jnp.exp(-f))
    return p 

def loss_fn(Beta, X, y):
    p = forward_fn(Beta, X)
    loss = -jnp.mean(y * jnp.log(p) + (1-y) * jnp.log(1-p))
    return loss 

class BinaryLinearModel():

    def __init__(self, 
                 features: List[str]):
        self._features = features 
    
    def train(self, rng, 
              df: pd.DataFrame, 
              y: np.ndarray, 
              epochs: int = 100, 
              eta: float = 0.01, 
              batch_size: int = 100000):
        grad_fn = jax.grad(loss_fn)

        y = jnp.array(y)
        
        # prepare inputs and outputs
        X = self._prepare_input_matrix(df[self._features])
        
        # randomly initialize solution 
        Beta = jax.random.normal(rng, X.shape[1]) # K

        # iterate for epochs
        history = []
        for i in range(epochs):

            # shuffle dataset (important)
            loop_key = jax.random.fold_in(rng, i)
            ix = jax.random.permutation(loop_key, X.shape[0])
            X = X[ix, :]
            y = y[ix]

            # go over mini batches and update
            for j in range(0, X.shape[0], batch_size):
                offset = j 
                end = j + batch_size

                # compute gradient
                # this is very powerful ... JAX takes care of derivative computation
                # so loss_fn could be as complex as you like
                Beta_grad = grad_fn(Beta, X[offset:end,:], y[offset:end])
                
                # update solution
                Beta = Beta - eta * Beta_grad

            # record epoch loss
            mse = loss_fn(Beta, X, y)
            history.append([Beta, mse])

        # save the parameters
        self._params, _ = history[-1]

        return history
    
    def _prepare_input_matrix(self, df: pd.DataFrame):

        # we need to separate categorical from numeric features
        # because they require separate processing
        # let's get categorical columns
        categorical_cols = df.select_dtypes(include='object').columns
        
        # let's get numeric
        ordinal_cols = df.select_dtypes(include='number').columns

        # construct input features
        X = df[ordinal_cols].to_numpy()

        # z-score (NxK' - 1xK') / 1xK' = NxK'
        X = (X - np.mean(X, axis=0)[None, :]) / np.std(X, axis=0)[None, :]

        # code categorical features
        for feature in categorical_cols:
            dummies = pd.get_dummies(df[feature]).to_numpy().astype(float)
            X = np.hstack((X, dummies)) 

        # add a column of ones
        ones_col = np.ones((X.shape[0], 1)) # Nx1
        X = np.hstack((ones_col, X)) # K
        
        return jnp.array(X) 
    
    def predict(self, df: pd.DataFrame):
         
        X = self._prepare_input_matrix(df[self._features])

        return forward_fn(self._params, X)
    
df = pd.read_csv("../data/separable_binary_data.csv")
df


rng = jax.random.key(52345)

#
# let's randomly split the data
#

# first, generate a shuffled permutation of indecies
ix = jax.random.permutation(rng, df.shape[0])
rng, _ = jax.random.split(rng)

# grab 80% of the shuffled data for training, rest is for testing
n_train = int(0.8 * df.shape[0])
train_ix = ix[:n_train]
test_ix = ix[n_train:]

train_df = df.iloc[train_ix]
test_df = df.iloc[test_ix]

model = BinaryLinearModel(['x1', 'x2'])

print("Train DF shape ", train_df.shape)
history = model.train(rng, train_df, train_df['y'].to_numpy(), epochs=20, eta=0.1, batch_size=10000)

# calculate null model on training data
mu = np.mean(train_df['y'].to_numpy())


Train DF shape  (800, 3)


In [55]:
# predict on test
print("Test DF shape ", test_df.shape)
yhat = model.predict(test_df)

# null model prediction
yhat_null = mu * np.ones(test_df.shape[0])

# hard decisions ...
threshold = 0.5 
yhat_hard = yhat > threshold
yhat_null_hard = yhat_null > threshold

ytrue = test_df['y'].to_numpy()

dict(
    accuracy = sklearn.metrics.accuracy_score(ytrue, yhat_hard),
    accuracy_null = sklearn.metrics.accuracy_score(ytrue, yhat_null_hard),
    
    balanced_accuracy = sklearn.metrics.balanced_accuracy_score(ytrue, yhat_hard),
    balanced_accuracy_null = sklearn.metrics.balanced_accuracy_score(ytrue, yhat_null_hard),

    recall = sklearn.metrics.recall_score(ytrue, yhat_hard),
    recall_null = sklearn.metrics.recall_score(ytrue, yhat_null_hard),

    precision = sklearn.metrics.precision_score(ytrue, yhat_hard),
    precision_null = sklearn.metrics.precision_score(ytrue, yhat_null_hard),

    f1 = sklearn.metrics.f1_score(ytrue, yhat_hard),
    f1_null = sklearn.metrics.f1_score(ytrue, yhat_null_hard),

    auc_roc = sklearn.metrics.roc_auc_score(ytrue, yhat),
    auc_pr = sklearn.metrics.average_precision_score(ytrue, yhat),
    auc_pr_null = sklearn.metrics.average_precision_score(ytrue, yhat_null),
)


Test DF shape  (200, 3)


{'accuracy': 0.7,
 'accuracy_null': 0.485,
 'balanced_accuracy': 0.7012311079971976,
 'balanced_accuracy_null': 0.5,
 'recall': 0.7422680412371134,
 'recall_null': 1.0,
 'precision': 0.6728971962616822,
 'precision_null': 0.485,
 'f1': 0.7058823529411765,
 'f1_null': 0.6531986531986532,
 'auc_roc': 0.7746972275047542,
 'auc_pr': 0.7838629030870375,
 'auc_pr_null': 0.485}