In [None]:
import numpy as np
import pandas as pd
import boto3
import sagemaker

# Optional override: set this to a bucket name to use it instead of the
# session's default bucket.
sagemaker_session_bucket = None

# A first session is created only so its default bucket can be looked up
# when no override was given above.
bootstrap_session = sagemaker.Session()
if sagemaker_session_bucket is None:
    sagemaker_session_bucket = bootstrap_session.default_bucket()

role = sagemaker.get_execution_role()

# The session kept as `sess` is bound to the bucket resolved above.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

In [None]:
from sklearn.calibration import calibration_curve, CalibrationDisplay

In [None]:
# Load the predictions table that the cells below analyse.
# NOTE(review): unpickling can execute arbitrary code -- only load a
# predictions file that comes from a trusted source.
predictions_path = "predictions.pkl"
pred = pd.read_pickle(predictions_path)

In [None]:
# Preview the first five rows of the predictions table.
pred.head(n=5)

In [None]:
# Fraction of rows whose predicted score is above 0.9.
pred["predicted"].gt(0.9).mean()

In [None]:
# Boolean mask marking the rows whose actual label equals 1.
# NOTE(review): the mask is displayed without any aggregation -- possibly an
# unfinished `.mean()` / `.sum()`; confirm the intent.
pred["actual"].eq(1)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Calibration curve: actual labels against predicted scores, 20 bins.
sns.set_style("whitegrid")
fig, ax = plt.subplots(figsize=(8, 6))

# NOTE(review): the global name `display` shadows IPython's built-in
# `display()` helper for the rest of the session; kept as-is so any cell
# relying on this variable keeps working.
display = CalibrationDisplay.from_predictions(
    pred.actual,
    pred.predicted,
    n_bins=20,
    ax=ax,
)
ax.legend(loc='upper left')

In [None]:
# Histogram of the predicted scores: 25 bins spanning the range [0, 1].
sns.histplot(
    data=pred,
    x='predicted',
    bins=25,
    binrange=(0, 1),
)

In [None]:
# Number of rows for each actual label value.
pred["actual"].value_counts()

In [None]:
from sklearn.metrics import confusion_matrix, roc_curve, auc

# ROC curve and the area under it, from the actual labels and predicted scores.
fpr, tpr, _ = roc_curve(pred.actual, pred.predicted)
roc_auc = auc(fpr, tpr)

lw = 2
plt.figure()

# Model curve; the legend entry reports the AUC to two decimals.
plt.plot(fpr, tpr, color="darkorange", lw=lw, label="ROC curve (area = %0.2f)" % roc_auc)
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
plt.plot([0, 1], [0, 1], color="navy", lw=lw, linestyle="--")

# Axis limits, labels and title applied to the current axes in one call.
plt.gca().set(
    xlim=[0.0, 1.0],
    ylim=[0.0, 1.05],
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="Receiver operating characteristic example",
)
plt.legend(loc="lower right")
plt.show()

In [None]:
# Confusion matrix at the 0.5 threshold, normalised over the true labels
# (each row sums to 1), shown as an annotated heatmap.
actual_labels = pred.actual.astype(int)
predicted_positive = pred.predicted > 0.5
cm = confusion_matrix(actual_labels, predicted_positive, normalize='true')
sns.heatmap(cm, annot=True)
plt.title("Confusion matrix (recall)")

In [None]:
# Confusion matrix at the 0.5 threshold, normalised over the predicted labels
# (each column sums to 1), shown as an annotated heatmap.
actual_labels = pred.actual.astype(int)
predicted_positive = pred.predicted > 0.5
cm = confusion_matrix(actual_labels, predicted_positive, normalize='pred')
sns.heatmap(cm, annot=True)
plt.title("Confusion matrix (precision)")

In [None]:
# Confusion matrix normalised over the predicted labels, using a stricter
# 0.95 decision threshold instead of 0.5.
cm = confusion_matrix(pred.actual.astype(int), pred.predicted > 0.95, normalize='pred')
sns.heatmap(cm, annot=True)
# The title names the threshold so this figure cannot be mistaken for the
# 0.5-threshold precision matrix, which is otherwise titled identically.
plt.title("Confusion matrix (precision, threshold = 0.95)")

In [None]:
# Print up to three randomly sampled false positives: rows whose actual label
# is 0 but whose predicted score is above 0.5.
print("False positive examples")
print("="*30)
false_positives = pred[(pred.actual == 0.0) & (pred.predicted > 0.5)]
# Cap the sample size at the number of matching rows: asking .sample() for
# more rows than exist (without replacement) raises ValueError.
n_examples = min(3, len(false_positives))
for _, row in false_positives.sample(n_examples).iterrows():
    print(f"Q: {row.question}")
    print(f"A: {row.context}")
    print("-"*30)

In [None]:
# Print up to three randomly sampled false negatives: rows whose actual label
# is 1 but whose predicted score is at or below 0.5.
print("False negative examples")
print("="*30)
false_negatives = pred[(pred.actual == 1.0) & (pred.predicted <= 0.5)]
# Cap the sample size at the number of matching rows: asking .sample() for
# more rows than exist (without replacement) raises ValueError.
n_examples = min(3, len(false_negatives))
for _, row in false_negatives.sample(n_examples).iterrows():
    print(f"Q: {row.question}")
    print(f"A: {row.context}")
    print("-"*30)

In [None]:
# Print up to three randomly sampled true positives: rows whose actual label
# is 1 and whose predicted score is above 0.5.
print("True positive examples")
print("="*30)
true_positives = pred[(pred.actual == 1.0) & (pred.predicted > 0.5)]
# Cap the sample size at the number of matching rows: asking .sample() for
# more rows than exist (without replacement) raises ValueError.
n_examples = min(3, len(true_positives))
for _, row in true_positives.sample(n_examples).iterrows():
    print(f"Q: {row.question}")
    print(f"A: {row.context}")
    print("-"*30)

In [None]:
# Number of false negatives at the 0.5 threshold: actual label 1 with a
# predicted score at or below 0.5.
(pred["actual"].eq(1.0) & pred["predicted"].le(0.5)).sum()

In [None]:
# Print up to three randomly sampled false negatives: rows whose actual label
# is 1 but whose predicted score is at or below 0.5.
# NOTE(review): this repeats an earlier false-negative cell; since the sample
# is unseeded, each run draws a fresh set of rows.
print("False negative examples")
print("="*30)
false_negatives = pred[(pred.actual == 1.0) & (pred.predicted <= 0.5)]
# Cap the sample size at the number of matching rows: asking .sample() for
# more rows than exist (without replacement) raises ValueError.
n_examples = min(3, len(false_negatives))
for _, row in false_negatives.sample(n_examples).iterrows():
    print(f"Q: {row.question}")
    print(f"A: {row.context}")
    print("-"*30)

In [None]:
from collections import defaultdict
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    brier_score_loss,
    log_loss,
    roc_auc_score,
)

# Build a one-row summary table of classification metrics for this model.
scores = defaultdict(list)
y_test = pred.actual
y_prob = pred.predicted
y_pred = (pred.predicted > 0.5).astype(int)  # hard labels at the 0.5 threshold
scores["Classifier"].append("BERT + YAL data")

# Pair each metric with the prediction form it should receive.
# Brier score, log loss and ROC AUC are computed from the continuous scores;
# precision, recall and F1 from the thresholded labels. ROC AUC in particular
# must see the scores: given 0/1 labels it collapses to a single operating
# point and no longer measures ranking quality.
metric_inputs = [
    (brier_score_loss, y_prob),
    (log_loss, y_prob),
    (precision_score, y_pred),
    (recall_score, y_pred),
    (f1_score, y_pred),
    (roc_auc_score, y_prob),
]

for metric, y_hat in metric_inputs:
    # Column label derived from the function name. Removing "score" leaves
    # stray whitespace (e.g. "Brier  loss", "Precision "); labels are kept
    # exactly as before so existing column lookups keep working.
    score_name = metric.__name__.replace("_", " ").replace("score", "").capitalize()
    scores[score_name].append(metric(y_test, y_hat))

score_df = pd.DataFrame(scores).set_index("Classifier")

# Display the table rounded to three decimals; `score_df` itself keeps full
# precision. (Previously the rounded copy was computed and thrown away.)
score_df.round(decimals=3)